diff --git a/.circleci/config.yml b/.circleci/config.yml
index 18233fd872..6404b6bb28 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -2,39 +2,52 @@ version: 2
 jobs:
   build:
     docker:
-      - image: danieluranga/leela_chess_zero-lc0_ubuntu_builder:0.0.8
+      - image: nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
     steps:
       - checkout
       - run:
-          name: "Pull Submodules"
-          command: git submodule update --init
+          name: Install build tools
+          command: |
+            apt-get update
+            apt-get -y install git python3-pip gcc-10 g++-10 clang-12 zlib1g zlib1g-dev wget
+            pip3 install meson==0.63
+            pip3 install ninja
       - run:
-          name: Update Meson
-          command: pip3 install --upgrade meson==0.58.1
+          name: Install onnxruntime
+          command: |
+            wget https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/onnxruntime-linux-x64-1.22.0.tgz -P /tmp
+            tar xzf /tmp/onnxruntime-linux-x64-1.22.0.tgz -C /tmp
       - run:
           name: Meson GCC
           environment:
-            CC: gcc-8
-            CXX: g++-8
-          command: meson build-gcc -Dgtest=false
+            CC: gcc-10
+            CXX: g++-10
+          command: meson build-gcc -Dgtest=false -Donnx_include=/tmp/onnxruntime-linux-x64-1.22.0/include -Donnx_libdir=/tmp/onnxruntime-linux-x64-1.22.0/lib
+      - run:
+          name: Meson Clang
+          environment:
+            CC: clang-12
+            CXX: clang++-12
+          command: meson build-clang -Dgtest=false -Db_lto=false -Donnx_include=/tmp/onnxruntime-linux-x64-1.22.0/include -Donnx_libdir=/tmp/onnxruntime-linux-x64-1.22.0/lib
       - run:
           name: Build GCC
           command: |
             cd build-gcc
             ninja -j 4
+      - run:
+          name: Build Clang
+          command: |
+            cd build-clang
+            ninja -j 4
   "mac":
     macos:
-      xcode: 14.1.0
-    resource_class: macos.m1.medium.gen1
+      xcode: 14.3.1
     steps:
       - checkout
-      - run:
-          name: "Pull Submodules"
-          command: git submodule update --init
       - run:
           name: Install build tools
           command: |
-            pip3 install meson==0.63
+            pip3 install meson==1.3.0
             pip3 install ninja
             curl -LJOs https://github.com/ispc/ispc/releases/download/v1.21.0/ispc-v1.21.0-macOS.universal.tar.gz
             tar xf ispc-v1.21.0-macOS.universal.tar.gz
@@ -56,10 +69,68 @@ jobs:
           command: lipo -create -o /tmp/lc0 build/lc0 build-arm/lc0
       - store_artifacts:
           path: /tmp/lc0
-          destination: lc0-macos_12.6.1
+          destination: lc0-macos_13.2.1
+      - run:
+          name: Prepare Workspace
+          command: |
+            mkdir -p workspace
+            mv /tmp/lc0 workspace
+      - persist_to_workspace:
+          root: workspace
+          paths: 
+            - lc0
+  "mac latest":
+    macos:
+      xcode: 26.1.0
+    steps:
+      - checkout
+      - run:
+          name: Install build tools
+          command: |
+            pip3 install meson
+            pip3 install ninja
+      - run:
+          name: Build lc0 arm
+          command: |
+            meson build-arm --buildtype=release -Dgtest=false -Dopencl=false
+            cd build-arm
+            ninja
+  "upload-github-release":
+    macos:
+      xcode: 14.3.1
+    steps:
+      - attach_workspace:
+          at: /tmp/workspace
+      - run:
+          name: Install GitHub CLI
+          command: brew install gh
+      - run:
+          name: Verify Workspace
+          command: |
+            ls -lah /tmp/workspace
+      - run:
+          name: Upload to GitHub Release
+          command: |
+            mv /tmp/workspace/lc0 /tmp/lc0-$CIRCLE_TAG-macos_13.2.1
+            gh release upload \
+              "$CIRCLE_TAG" \
+              /tmp/lc0-$CIRCLE_TAG-macos_13.2.1 \
+              --clobber --repo LeelaChessZero/lc0
 workflows:
   version: 2
   builds:
     jobs:
       - build
-      - "mac"
+      - "mac":
+          filters:
+            tags:
+              only: /v[0-9]+(\.[0-9]+)*(\-.+)?/
+      - "mac latest"
+      - "upload-github-release":
+          requires:
+            - "mac"
+          filters:
+            tags:
+              only: /v[0-9]+(\.[0-9]+)*(\-.+)?/
+            branches:
+              ignore: /.*/
diff --git a/.gitignore b/.gitignore
index ea18c9ee58..ae0570869a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 .cache/
 .clangd/
 build/
+builddir/
 __pycache__/
 compile_commands.json
 CUDA_NN/
diff --git a/.gitmodules b/.gitmodules
index 6575e63266..e69de29bb2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "libs/lczero-common"]
-	path = libs/lczero-common
-	url = https://github.com/LeelaChessZero/lczero-common.git
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000000..b7d6010ae3
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,129 @@
+A-childs-encyclopedia
+Adam Treat
+Alex Greason
+Alexander Lyashuk
+Alexis Olson
+alice
+almaudoh
+Aloril
+Andrew Grant
+Andy Olsen
+Aniebiet Udoh
+Ankan
+Ankan Banerjee
+Anson Hu
+Asger Alstrup Palm
+baum
+benini
+borg323
+Boštjan Mejak
+Brandon Lin
+Brett Holman
+Carlo Wood
+Chin-Chang Yang
+cn4750
+Cong
+Contrad Namiseb (Bonan)
+Copilot (bot)
+cwbriscoe
+danegraphics
+Daniel Monroe
+Daniel Uranga
+Dieter Dobbelaere
+dje-dev
+Dominik Schlösser
+DU-jdto
+dubslow
+Dubslow
+Ed Lee
+Epanek
+Error323
+evalon32
+exa
+exoticorn
+F. Huizinga
+fischerandom
+fohristiwhirl
+Francis
+Francis Li
+Francois
+Francois Pays
+François Pays
+Gabe
+Ganesh Krishnan
+GBeauregard
+Gergely Fülöp
+Gian-Carlo Pascutto
+gmorenz
+Google LLC
+gsobala
+Hace
+Hans Ekbrand
+Henrik Forstén
+Ikko Eltociear Ashimine
+Jack L
+Jack Thomson
+James Horsfall Thomas
+jamie
+jjoshua2
+John Newlin
+john-sp
+Julian-Dominik Helmsen
+Karl Kfoury
+Kathleen Mcgrievy
+kiilas
+Kip Hamiltons
+Kovax
+Leandro Álvarez González
+Linmiao Xu
+Lisa Butterfly
+Luka Rahne
+Marcin Stefaniuk
+Martin
+Martin Senft
+masterkni6
+masterkni666
+Menkib
+Mike Roberts
+Naphthalin
+nathan-lc0
+Neelesh Jain
+nguyenpham
+noobpwnftw
+Ofek Shochat
+oscardssmith
+Pan
+patrik-ha
+PaulJeFi
+Pratik Dixit
+psykose
+QxC4eva
+Rafal Bielski
+Raj
+Reece H. Dunn
+Ron Wolf
+Sami Kiminki
+Sherman Siu
+Shreyas Kapur
+shtayerc
+Shukant Pal
+Simon
+slash
+students
+SunnyWar
+TesseractA
+Tilps
+Timofey Kondrashov
+Ting-Hsuan Huang
+Tony Su
+trre123
+Usman Haider
+uyhcire
+Valentin
+Valeriy Huz
+Victor Popovici
+Videodr0me
+Viet-Anh Tran
+Viren6
+Yan Zhang
+zz4032
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 90c7d4aa63..0249700c0a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,81 +1,135 @@
-# Contributing to lc0
-
-These are the guidelines and standards followed by this codebase.
-
-The language is C++, specifically C++17. As such, manual `new` and `delete` memory mangement is strongly discouraged; use the standard library tools for managing memory (such as `unique_ptr`, `shared_ptr` etc.).
-
-This codebase uses semantic versioning. A release is the final commit for that version number, and all subsequent commits are development for the next version. `master` is the default branch, and the active development branch (as such, all Pull Requests go here); it always targets a minor (or major) version which succeeds the current relase. `release` is always equivalent to the latest tag.
-
-
-### Style
-
-Style is of course the first guideline on every new contributor's mind :)
-
-This codebase largely complies with the [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). The maintainers recommend the use of [Clang's auto formatter](https://clang.llvm.org/docs/ClangFormatStyleOptions.html).
-
-Notable exceptions:
- 1. C++ exceptions are allowed (in fact, only `lczero::Exception`, defined in `utils/exception.h`, is allowed)
- 2. We use `#pragma once` instead of header guards.
- 3. Default function parameters are sometimes allowed.
- 4. Rvalue reference function params are sometimes allowed, not only for constructors and assignment operators.
-
-For items (3) and (4), usage of those are discouraged, only use them if they benefit readability or have significant performance gain. It's possible that those exceptions (3) and (4) will be disallowed in future.
-
-The most important rule to follow is consistency: look at the surrounding code when doing changes and follow similar style.
-
-These are the most important parts of the codebase style (as a sort of tl;dr):
-
- * Comments must be full sentences, i.e. capitalized and ending in a period. (Sentences with elided subjects are fine.) Only `//` style comments are allowed, `/* */` style comments aren't.
-
- * Braces are a variant of K&R style, as can be gleaned from existing code. All `if` statements must use braces, with the possible exception of single statement `if`s, which *may* omit if the braces *if* the conditional and following statement are on the same line. Again, see surrounding code for examples.
-
- * Indentation is two spaces; \t characters are disallowed.
-
- * Code line length is strictly capped at 80 characters.
-
- * Using non-`const` references as function parameters is disallowed; use pointers instead. (Using `const` references as parameters is fine.)
-
- * Identifier style:
-   - `kLikeThis` for constants and enum values
-   - `like_this` for variables
-   - `like_this_` for member variables
-   - `LikeThis` for function and class names
-
- * All code should be inside `namespace lczero`
-
-The internal code dependency structure looks like this:
-
- * Code in `src/utils` is not allowed to depend on any other code.
-
- * Code in `src/chess` only depends on `src/utils`
-
- * Code in `src/neural` only depends on `src/utils` and `src/chess`
-
- * Code in `src/mcts` only depends on `src/utils`, `src/chess` and `src/neural`
-
-
-### Git history
-
-Pull Requests are squashed when merged. This means all commits in the branch will be squashed into one commit applied onto master, so branches and their PRs should stick to *one* topic only. If you think changes deserve separate commits, make separate PRs for each commit.
-
-This also means it's not possible to reuse one branch for multiple PRs; new PRs must either use entirely new branches, or else you could use `git reset --hard` on the current branch.
-
-
-### Allowed features
-
-Lc0 is still in early stages of development, and has not yet reached the point where we are ready to add small tweaks to add few points of a rating. Large code changes still happen, and having lots of small optimizations adds overhead to larger changes, slowing development.
-
-Therefore, as a rule, search algorithm tweaks that give a gain of less than ~20 Elo points are discouraged at this point. (This limit will gradually be lowered as Lc0 code matures, eventually to 0.0 Elo).
-
-
-#### Adding new command line flags/UCI parameters
-
-Only add new parameters if users can significantly (>20 Elo) benefit by tweaking it. We don't want to make every single constant configurable (or rather, users don't want to see hundreds of parameters which don't do anything).
-
-Try to minimize number of parameters that your feature introduces. If your feature introduces several parameters, every individual parameter should be significant (i.e. tweaking it with other fixes will give >20 Elo).
-
-
-#### Adding features for testing
-
-It is fine to temporarily commit a feature of unknown Elo gain so that people may test it. It's also fine to expose many parameters for the feature initially so that people can tune them. However, if the tweak doesn't prove to be significant, it should be removed after a few weeks.
-
+# Contributing to Leela Chess Zero (Lc0)
+
+Last updated: June 2025
+
+Thank you for your interest in contributing to LCZero! This document provides
+guidelines for contributing to the codebase.
+
+## Before you start
+
+* All contributors are encouraged to join our Discord server at
+  <https://lc0.org/chat>.
+* Refer to [README.md](README.md) for building and running instructions.
+  * The protobufs that are shared with the training code are located in a
+    separate repository. Don't forget to run `git submodule update --init` to
+    fetch them.
+* Familiarize yourself with the developer documentation at
+  <https://lczero.org/dev/>.
+
+We use the [Meson](https://mesonbuild.com/) build system.
+
+* In Linux, using `builddir/` is recommended for development
+  (`meson setup builddir/`), as VSCode recognizes it and all the development and
+  debugging tools work there (ask in Discord if you have issues).
+  * In the `builddir/`, run `ninja lc0` (which is faster than just `ninja`). Run
+    `ninja test` to run the tests.
+* In Windows, `meson setup build/debug` generates a Visual Studio solution that
+  can be used for development.
+* Check `meson_options.txt` for the available build options (to use them, pass
+  `-D<name>=<value>` to `meson setup`).
+
+## Sending Pull Requests
+
+* We use GitHub for managing contributions. Please fork the repository and
+  create a new branch for your changes.
+* It's encouraged to discuss your changes in the Discord server before
+  starting work.
+  * Small bug fixes are fine to submit without prior discussion.
+  * Large changes that add code rather than modifying existing code (e.g. new
+    backends, new search algorithms) are generally fine as well. Use your best
+    judgement on whether your change may be controversial.
+  * Changes that modify existing code (e.g. search algorithm tweaks, API
+    changes) should be discussed first.
+
+Changes that may affect playing strength **must** be tested.
+
+* Unfortunately, we don't have a robust strength testing framework yet (working
+  on it), so ask in the #testing-discuss channel on Discord for help with
+  testing.
+* Even for Elo-positive changes, we need to balance the strength and
+  maintainability of the code. If your change is Elo-positive but makes the code
+  more complex, please discuss it first. Recently, we added an option to
+  clone the search algorithm in extreme cases.
+* Elo-neutral simplifications are always welcome.
+
+Pull Requests are squashed when merged. This means all commits in the branch
+will be squashed into one commit applied onto master. This makes it tricky to
+reuse the branch and continue to work on it after the PR is merged.
+
+**Note:** This section only applies if you have dependent branches that were
+built on top of your merged PR branch. If you only had one branch, you can
+simply delete it after merging.
+
+**Example scenario:** You had a branch `add-feature` that got merged, and you
+have another branch `extend-feature` that was based on `add-feature`. After
+`add-feature` is merged, you need to rebase `extend-feature` onto the new
+master. Here's what to do after your PR is merged:
+
+```shell
+git fetch upstream  # Update your local master
+git checkout extend-feature  # Switch to your dependent branch
+git rebase --update-refs --onto upstream/master add-feature # Rebase onto the updated master
+```
+
+The `--update-refs` flag will also update any branches between your leaf branch
+and the merged branch if you have any.
+
+## C++ Standard and Libraries
+
+* We use most C++20 features. However, supported compilers are GCC 10 and clang
+  10, so some features may not be available.
+* We use protocol buffers. However, we don't use any external library for it,
+  but rather generate the code from `.proto` files using the script in
+  `scripts/`.
+* Since v0.32, we use Abseil (`absl::`).
+* Use `CERR` for logging (goes to stderr and log), or `LOGFILE` (goes to log
+  file only).
+* Writing tests is encouraged. We use `gtest`/`gmock` for unit tests. Tests are
+  located in the same directory as the code they test, in a file with the same
+  name but ending with `_test.cc`.
+
+## Style Guidelines
+
+We follow the
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) with
+these modifications:
+
+* **Header guards**: Use `#pragma once` instead of traditional header guards.
+* **Exceptions**: C++ exceptions are allowed, but only `lczero::Exception`.
+* **References vs Pointers**: Non-const reference parameters are neither
+  encouraged nor discouraged over pointers (in the Google style guide, they used
+  to be discouraged, and now they are encouraged).
+* **Formatting**: Run `clang-format` on all code before committing.
+* **RValue references**: Rvalue reference function parameters are allowed, not
+  only for constructors and assignment operators. However, use them only if they
+  benefit readability or have significant performance gain.
+* Every new file should contain a GPLv3 banner with an additional exception
+  under GNU GPL version 3 section 7 regarding NVIDIA libraries (see examples in
+  existing files).
+  * It's allowed to write new backends without the NVIDIA exception, but that
+    means that we won't be able to include NVIDIA libraries if we link with that
+    code.
+
+## AI-Assisted Development
+
+AI tools and coding assistants are allowed for contributing to Leela Chess Zero,
+provided you follow these guidelines:
+
+* **Disclose AI usage**: Clearly mention in your PR description if you used AI
+  tools, LLMs, or agentic coding approaches (beyond simple code completion).
+* **Maintain code ownership**: You must thoroughly read, understand, and review
+  all AI-generated code in detail before submitting.
+* **Ensure quality**: Take all appropriate steps to verify the code quality,
+  correctness, and adherence to project standards.
+* **Submit only what you could write yourself**: Use AI as a productivity
+  booster, not as a replacement for proper programming skills and domain
+  knowledge.
+* **Core vs. auxiliary code**: While AI assistance works well for auxiliary code
+  (website, documentation, simple tools), it has failed so far on core lc0
+  engine code. We discourage using agentic coding for core engine components but
+  welcome it for simpler projects within the Leela Chess Zero organization.
+
+Remember: You are responsible for the quality and correctness of all code you
+submit, regardless of how it was generated.
+
+Thank you for helping make Leela Chess Zero better!
diff --git a/OpenBench/Makefile b/OpenBench/Makefile
new file mode 100644
index 0000000000..6147fd9125
--- /dev/null
+++ b/OpenBench/Makefile
@@ -0,0 +1,24 @@
+ifndef EXE
+	EXE = $(CURDIR)/lc0
+endif
+
+BUILD_FLAGS =
+ifdef EVALFILE
+	BUILD_FLAGS += -Dembed=true
+endif
+ifdef SEARCH
+	BUILD_FLAGS += -Ddefault_search=$(SEARCH)
+endif
+
+POST_BUILD_COMMANDS =
+ifdef EVALFILE
+	POST_BUILD_COMMANDS = \
+		cat $(EVALFILE) >> $(EXE); \
+		perl -e "printf '%sLc0!', pack('V', -s '$(EVALFILE)')" >> $(EXE)
+endif
+
+.PHONY: all
+all:
+	chmod +x ../build.sh
+	../build.sh $(BUILD_FLAGS) && mv ../build/release/lc0 $(EXE)
+	$(POST_BUILD_COMMANDS)
\ No newline at end of file
diff --git a/README.md b/README.md
index a56da72740..0ce7a2a125 100644
--- a/README.md
+++ b/README.md
@@ -7,33 +7,28 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, s
 
 ## Downloading source
 
-Lc0 can be acquired either via a git clone or an archive download from GitHub. Be aware that there is a required submodule which isn't included in source archives.
+Lc0 can be acquired either via a git clone or an archive download from GitHub.
 
-For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.31`), which is equivalent to using the latest version tag.
+For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.32`), which is equivalent to using the latest version tag.
 
 Versioning follows the Semantic Versioning guidelines, with major, minor and patch sections. The training server enforces game quality using the versions output by the client and engine.
 
-
 Download using git:
 
 ```shell
-git clone -b release/0.31 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
+git clone -b release/0.32 https://github.com/LeelaChessZero/lc0.git
 ```
 
 If you have cloned already an old version, fetch, view and checkout a new branch:
 ```shell
 git fetch --all
 git branch --all
-git checkout -t remotes/origin/release/0.31
+git checkout -t remotes/origin/release/0.32
 ```
 
-
-If you prefer to download an archive, you need to also download and place the submodule:
- * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.31) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.31) archive is also available)
+If you prefer to download an archive:
+ * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.32) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.32) archive is also available)
  * Extract
- * Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz))
- * Move the second archive into the first archive's `libs/lczero-common/` folder and extract
- * The final form should look like `<TOP>/libs/lczero-common/proto/`
 
 Having successfully acquired Lc0 via either of these methods, proceed to the build section below and follow the instructions for your OS.
 
@@ -42,13 +37,11 @@ Having successfully acquired Lc0 via either of these methods, proceed to the bui
 
 Building should be easier now than it was in the past. Please report any problems you have.
 
-Aside from the git submodule, lc0 requires the Meson build system and at least one backend library for evaluating the neural network, as well as the required `zlib`. (`gtest` is optionally used for the test suite.) If your system already has this library installed, they will be used; otherwise Meson will generate its own copy of the two (a "subproject"), which in turn requires that git is installed (yes, separately from cloning the actual lc0 repository). Meson also requires python and Ninja.
+Building lc0 requires the Meson build system and at least one backend library for evaluating the neural network, as well as a few libraries. If your system already has these libraries installed, they will be used; otherwise Meson will generate its own copy (a "subproject"), which in turn requires that git is installed (yes, separately from cloning the actual lc0 repository). Meson also requires python and Ninja.
 
-Backend support includes (in theory) any CBLAS-compatible library for CPU usage, such as OpenBLAS or Intel's DNNL or MKL. For GPUs, OpenCL and CUDA+cudnn are supported, while DX-12 can be used in Windows 10 with latest drivers.
+Backend support includes (in theory) any CBLAS-compatible library for CPU usage, but OpenBLAS or Intel's DNNL are the main ones. For GPUs, the following are supported: CUDA (with optional cuDNN), various flavors of onnxruntime, and Apple's Metal Performance Shaders. There is also experimental SYCL support for AMD and Intel GPUs.
 
-Finally, lc0 requires a compiler supporting C++17. Minimal versions seem to be g++ v8.0, clang v5.0 (with C++17 stdlib) or Visual Studio 2017.
-
-*Note* that cuda checks the compiler version and stops even with newer compilers, and to work around this we have added the `nvcc_ccbin` build option. This is more of an issue with new Linux versions, but you can get around it by using an earlier version of gcc just for cuda. As an example, adding `-Dnvcc_ccbin=g++-9` to the `build.sh` command line will use g++-9 with cuda instead of the system compiler.
+Finally, lc0 requires a compiler supporting C++20. Minimum tested versions are g++ v10.0, clang v12.0 and Visual Studio 2019 version 16.11.
 
 Given those basics, the OS and backend specific instructions are below.
 
@@ -56,160 +49,125 @@ Given those basics, the OS and backend specific instructions are below.
 
 #### Generic
 
-1. Install backend:
-    - If you want to use NVidia graphics cards Install [CUDA](https://developer.nvidia.com/cuda-zone) and [cuDNN](https://developer.nvidia.com/cudnn).
-    - If you want to use AMD graphics cards install OpenCL.
-    - if you want OpenBLAS version Install OpenBLAS (`libopenblas-dev`).
+1. Install backend (also read the detailed instructions in later sections):
+    - If you want to use NVIDIA graphics cards, install [CUDA](https://developer.nvidia.com/cuda-zone) (and optionally [cuDNN](https://developer.nvidia.com/cudnn)).
+    - If you want to use AMD or Intel graphics cards, you can try SYCL.
+    - If you want BLAS, install either OpenBLAS or DNNL.
 2. Install ninja build (`ninja-build`), meson, and (optionally) gtest (`libgtest-dev`).
 3. Go to `lc0/`
 4. Run `./build.sh`
 5. `lc0` will be in `lc0/build/release/` directory
-6. Unzip a [neural network](https://lczero.org/play/networks/bestnets/) in the same directory as the binary.
+6. Download a [neural network](https://lczero.org/play/networks/bestnets/) into the same directory as the binary (no need to unpack it).
 
 If you want to build with a different compiler, pass the `CC` and `CXX` environment variables:
+```shell
+CC=clang CXX=clang++ ./build.sh
+```
 
-    CC=clang-6.0 CXX=clang++-6.0 ./build.sh
-
-#### Note on installing CUDA on Ubuntu
-
-Nvidia provides .deb packages. CUDA will be installed in `/usr/local/cuda-10.0` and requires 3GB of diskspace.
-If your `/usr/local` partition doesn't have that much space left you can create a symbolic link before
-doing the install; for example: `sudo ln -s /opt/cuda-10.0 /usr/local/cuda-10.0`
-
-The instructions given on the nvidia website tell you to finish with `apt install cuda`. However, this
-might not work (missing dependencies). In that case use `apt install cuda-10-0`. Afterwards you can
-install the meta package `cuda` which will cause an automatic upgrade to a newer version when that
-comes available (assuming you use `Installer Type deb (network)`, if you'd want that (just cuda-10-0 will
-stay at version 10). If you don't know what to do, only install cuda-10-0.
-
-cuDNN exists of two packages, the Runtime Library and the Developer Library (both a .deb package).
+#### Ubuntu 20.04
 
-Before you can download the latter you need to create a (free) "developer" account with nvidia for
-which at least a legit email address is required (their website says: The e-mail address is not made public
-and will only be used if you wish to receive a new password or wish to receive certain news or notifications
-by e-mail.). Further they ask for a name, date of birth (not visible later on), country, organisation ("LeelaZero"
-if you have none), primary industry segment ("Other"/none) and which development areas you are interested
-in ("Deep Learning").
+For Ubuntu 20.04 you need meson, ninja and gcc-10 before performing the steps above. The following should work:
+```shell
+apt-get update
+apt-get -y install git python3-pip gcc-10 g++-10 zlib1g zlib1g-dev
+pip3 install meson
+pip3 install ninja
+CC=gcc-10 CXX=g++-10 INSTALL_PREFIX=~/.local ./build.sh
+```
 
-#### Ubuntu 18.04
+Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
 
-For Ubuntu 18.04 you need the latest version of meson, libstdc++-8-dev, and clang-6.0 before performing the steps above:
+### Windows
 
-    sudo apt-get install libstdc++-8-dev clang-6.0 ninja-build pkg-config
-    pip3 install meson --user
-    CC=clang-6.0 CXX=clang++-6.0 INSTALL_PREFIX=~/.local ./build.sh
+Here are the brief instructions for CUDA/cuDNN, for details and other options see `windows-build.md` and the instructions in the following sections.
 
-Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
+1. Install Microsoft Visual Studio (2019 version 16.11 or later)
+2. Install [CUDA](https://developer.nvidia.com/cuda-zone)
+3. (Optionally install [cuDNN](https://developer.nvidia.com/cudnn)).
+4. Install Python3 if you didn't install it with Visual Studio.
+5. Install Meson: `pip3 install --upgrade meson`
+6. If `CUDA_PATH` is not set (run the `set` command to see the full list of variables), edit `build.cmd` and set `CUDA_PATH` to your CUDA directory.
+* If you also want cuDNN, set `CUDNN_PATH` to your cuDNN directory (not needed if it is the same as `CUDA_PATH`).
 
-#### Ubuntu 16.04
+7. Run `build.cmd`. It will ask permission to delete the build directory, then generate MSVS project and pause.
 
-For Ubuntu 16.04 you need the latest version of meson, ninja, clang-6.0, and libstdc++-8:
+Then either:
 
-    wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-    sudo apt-add-repository 'deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main'
-    sudo add-apt-repository ppa:ubuntu-toolchain-r/test
-    sudo apt-get update
-    sudo apt-get install clang-6.0 libstdc++-8-dev
-    pip3 install meson ninja --user
-    CC=clang-6.0 CXX=clang++-6.0 INSTALL_PREFIX=~/.local ./build.sh
+8. Hit `Enter` to build it.
+9. Resulting binary will be `build/lc0.exe`
 
-Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
+Or:
 
-#### openSUSE (all versions)
+8. Open generated solution `build/lc0.sln` in Visual Studio and build it yourself.
 
-Instructions, packages and tools for building on openSUSE are at [openSUSE_install.md](openSUSE_install.md)
+### Mac
 
-#### Docker
+You will need Xcode and Python 3 installed. Then you need to install some required packages through Terminal:
 
-Use https://github.com/vochicong/lc0-docker
-to run latest releases of lc0 and the client inside a Docker container.
+1. Install meson: `pip3 install meson`
+2. Install ninja: `pip3 install ninja`
 
+Now download the lc0 source, if you haven't already done so, following the instructions earlier in the page.
 
-### Windows
+3. Go to the lc0 directory.
+4. Run `./build.sh -Dgtest=false`
 
-Here are the brief instructions for CUDA/CuDNN, for details and other options see `windows-build.md`.
+The compiled Lc0 will be in `build/release`.
 
-0. Install Microsoft Visual Studio (2017 or later)
-1. Install [CUDA](https://developer.nvidia.com/cuda-zone)
-2. Install [cuDNN](https://developer.nvidia.com/cudnn).
-3. Install Python3
-4. Install Meson: `pip3 install --upgrade meson`
-5. Edit `build.cmd`:
+Starting with v0.32.0, we are also offering a pre-compiled version that can be downloaded from the [release page](https://github.com/LeelaChessZero/lc0/releases).
 
-* Set `CUDA_PATH` with your CUDA directory
-* Set `CUDNN_PATH` with your cuDNN directory (may be the same with CUDA_PATH)
+### CUDA
 
-6. Run `build.cmd`. It will ask permission to delete the build directory, then generate MSVS project and pause.
+CUDA can be downloaded and installed following the instructions from <https://developer.nvidia.com/cuda-downloads>. In most cases the build will pick it up with no further action. However, if the CUDA compiler (`nvcc`) is not found, you can call the build like this: `PATH=/usr/local/cuda/bin:$PATH ./build.sh`, replacing the path with the correct one for `nvcc`.
 
-Then either:
+*Note* that CUDA uses the system compiler and stops if it doesn't recognize the version, even if it is newer. This is more of an issue with new Linux versions, but you can get around it with the `nvcc_ccbin` build option, which specifies a different compiler just for CUDA. As an example, adding `-Dnvcc_ccbin=g++-11` to the build command line will use g++-11 with CUDA instead of the system compiler.
 
-7. Hit `Enter` to build it.
-8. Resulting binary will be `build/lc0.exe`
+### ONNX
 
-Or.
+Lc0 offers several ONNX based backends, namely onnx-cpu, onnx-cuda, onnx-trt, onnx-rocm and on Windows onnx-dml, utilizing the execution providers offered by onnxruntime.
 
-7. Open generated solution `build/lc0.sln` in Visual Studio and build yourself.
+Some Linux systems are starting to offer onnxruntime packages, so after installing this there is a good chance the Lc0 build will pick it up with no further action required. Otherwise you can set the `onnx_libdir` and `onnx_include` build options to point to the onnxruntime libraries and include directories respectively. The same options are used if you unpack a package downloaded from <https://github.com/microsoft/onnxruntime/releases>.
 
-### Mac
+For Windows, we offer pre-compiled packages for onnx-dml and onnx-trt, see the included README for installation instructions.
 
-First you need to install some required packages through Terminal:
-1. Install brew as per the instructions at https://brew.sh/
-2. Install python3: `brew install python3`
-3. Install meson: `brew install meson`
-4. Install ninja: `brew install ninja`
-5. (For Mac OS 10.14 Mojave, or if the other step 5 fails):
- * Install developer tools: ``xcode-select --install``
- * When using Mojave install SDK headers: `installer -pkg /Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg -target /` (if this doesn't work, use `sudo installer` instead of just `installer`.)
+### SYCL
 
-Or.
+*Note* that SYCL support is new in v0.32.0 and as such is still considered experimental.
 
-5. (For MacOS 10.15 Catalina, or if the other step 5 fails): 
- * Install Xcode command-line tools: ``xcode-select --install``
- * Install "XCode Developer Tools" through the app store. (First one on the list of Apps if searched.)
- * Associate the SDK headers in XCode with a command: export CPATH=\`xcrun --show-sdk-path\`/usr/include
- 
-Now download the lc0 source, if you haven't already done so, following the instructions earlier in the page.
+You will need the Intel "oneAPI DPC++/C++ Compiler", "DPC++ Compatibility Tool" and (for an Intel GPU) "oneAPI Math Kernel Library (oneMKL)" or (for an AMD GPU) hipBLAS.
 
-6. Go to the lc0 directory.
-7. Run `./build.sh -Dgtest=false` (needs step 5)
+The Intel tools can be found in either the "oneAPI Base Toolkit" or "C++ Essentials" packages that can be downloaded from
+<https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html>, while hipBLAS can be downloaded from
+<https://rocm.docs.amd.com/projects/hipBLAS/en/latest/>
 
-### Raspberry Pi
+The compiler for C code is icx and for C++ code is icx on Windows but icpx on Linux.
 
-You'll need to be running the latest Raspberry Pi OS "buster".
+To build Lc0 with SYCL you need to set the `sycl` build option using `-Dsycl=l0` (that is el zero) for an Intel GPU or `-Dsycl=amd` for (you guessed it) an AMD GPU.
 
-1. Install OpenBLAS
+You may also have to set the `dpct_include` option to point to the DPC++ Compatibility Tool includes, the `onemkl_include` similarly for the oneMKL includes, or `hip_libdirs` and `hip_include` to the AMD HIP libraries and includes respectively.
 
+On Linux, a typical session would go like this:
 ```shell
-git clone https://github.com/xianyi/OpenBLAS.git
-cd OpenBLAS/
-make
-sudo make PREFIX=/usr install
-cd ..
+. /opt/intel/oneapi/setvars.sh --include-intel-llvm
+CC=icx CXX=icpx AR=llvm-ar ./build.sh release -Dgtest=false -Dsycl=l0
 ```
+The first line is to initialize the build environment and is only needed once per session, while the build line may need modification as described above.
 
-2. Install Meson
+On Windows you will have to build using `ninja`, which is provided by Visual Studio if you install the CMake component. We provide a `build-sycl.cmd` script that should build just fine for an Intel GPU. This script has not yet been tested with an AMD GPU, so some editing will be required.
 
-```shell
-pip install meson
-pip install ninja
-```
+You can also install the [oneAPI DPC++/C++ Compiler Runtime](https://www.intel.com/content/www/us/en/developer/articles/tool/compilers-redistributable-libraries-by-version.html) so you can run Lc0 without needing to initialize the build environment every time.
 
-3. Install compiler and standard libraries
+### BLAS
 
-```shell
-sudo apt install clang-6.0 libstdc++-8-dev
-```
+Lc0 can also run (a bit slowly) on CPU, using matrix multiplication functions from a BLAS library. By default OpenBLAS is used if available as it seems to offer good performance on a wide range of processors. If your system doesn't offer an OpenBLAS package (e.g. `libopenblas-dev`), or you have a recent processor, you can get DNNL from [here](<https://github.com/uxlfoundation/oneDNN/releases/v2.2>). To use DNNL you have to pass `-Ddnnl=true` to the build and specify the directory where it was installed using the `-Ddnnl_dir=` option. On Macs, the Accelerate library will be used.
 
-4. Clone lc0 and compile
+If the "Intel Implicit SPMD Program Compiler" (`ispc`) is [installed](<https://ispc.github.io/downloads.html>), some performance critical functions will use vectorized code for faster execution. 
 
-```shell
-git clone https://github.com/LeelaChessZero/lc0.git
-cd lc0
-git submodule update --init --recursive
-CC=clang-6.0 CXX=clang++-6.0 ./build.sh -Ddefault_library=static
-```
+*Note* that Lc0 is not able to control the number of threads with all BLAS libraries. Some libraries try to exploit cores aggressively, in which case it may be best to leave the threads set to the default (i.e. automatic) setting.
+
+## Getting help
 
-5. The resulting binary will be in build/release
+If there is an issue or the above instructions were not clear, you can always ask for help. The fastest way is to ask in the help channel of our [discord chat](http://lc0.org/chat), but you can also open a [github issue](https://github.com/LeelaChessZero/lc0/issues) (after checking the issue hasn't already been reported).
 
 ## Python bindings
 
@@ -240,8 +198,8 @@ along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
 
 ### Additional permission under GNU GPL version 3 section 7
 
-_The source files of Lc0 with the exception of the BLAS and OpenCL
-backends (all files in the `blas` and `opencl` sub-directories) have
+_The source files of Lc0 with the exception of the BLAS, OpenCL and SYCL
+backends (all files in the `blas`, `opencl` and `sycl` sub-directories) have
 the following additional permission, as allowed under GNU GPL version 3
 section 7:_
 
diff --git a/appveyor.yml b/appveyor.yml
index 543b60f4bf..e68f9f136e 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -2,24 +2,28 @@ version: '{build}'
 configuration: Release
 platform: x64
 image:
-- Visual Studio 2017
+- Visual Studio 2019
 environment:
   matrix:
   - NAME: gpu-nvidia-cudnn
-  - NAME: gpu-nvidia-cuda
+  - NAME: gpu-nvidia-cuda12
 #  - NAME: gpu-dx12
 #  - NAME: gpu-opencl
   - NAME: cpu-dnnl
   - NAME: cpu-openblas
 #  - NAME: onednn
-  - NAME: onnx-dml
+  - NAME: onnx
   - NAME: android
+  - NAME: gpu-nvidia-cuda11
 for:
 -
   matrix:
     only:
+    - NAME: gpu-nvidia-cudnn
+    - NAME: gpu-nvidia-cuda11
 #    - NAME: gpu-opencl
     - NAME: cpu-dnnl
+    - NAME: cpu-openblas
   skip_non_tags: true
 clone_folder: c:\projects\lc0
 install:
@@ -29,34 +33,38 @@ install:
 - cmd: set OPENCL=false
 - cmd: set BLAS=false
 - cmd: set ONEDNN=false
-- cmd: set ONNX_DML=false
+- cmd: set ONNX=false
 - cmd: set GTEST=false
 - cmd: set ANDROID=false
 - cmd: IF %NAME%==android set ANDROID=true
 - cmd: IF %NAME%==gpu-nvidia-cudnn set CUDNN=true
 - cmd: IF %NAME%==gpu-nvidia-cudnn set CUDA=true
-- cmd: IF %NAME%==gpu-nvidia-cuda set CUDA=true
+- cmd: IF %NAME%==gpu-nvidia-cuda11 set CUDA=true
+- cmd: IF %NAME%==gpu-nvidia-cuda12 set CUDA=true
 - cmd: IF %NAME%==gpu-dx12 set DX=true
 - cmd: IF %NAME%==gpu-opencl set OPENCL=true
 - cmd: IF %NAME%==cpu-dnnl set BLAS=true
 - cmd: IF %NAME%==cpu-openblas set BLAS=true
-- cmd: IF %NAME%==cpu-openblas set GTEST=true
 - cmd: IF %NAME%==onednn set ONEDNN=true
-- cmd: IF %NAME%==onnx-dml set ONNX_DML=true
+- cmd: IF %NAME%==onnx set ONNX=true
+- cmd: IF %NAME%==onnx set GTEST=true
 - cmd: set NET=753723
 - cmd: set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198
 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET=791556
 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET_HASH=f404e156ceb2882470fd8c032b8754af0fa0b71168328912eaef14671a256e34
-- cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
+#- cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
 - cmd: set DNNL_NAME=dnnl_win_1.5.0_cpu_vcomp
 - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/oneapi-src/oneDNN/releases/download/v1.5/dnnl_win_1.5.0_cpu_vcomp.zip
 - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_1.5.0_cpu_vcomp.zip -oC:\cache
 - cmd: IF %NAME%==onednn set DNNL_NAME=dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp
 - cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/borg323/oneDNN/releases/download/v2.7.2/dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip
 - cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip -oC:\cache
-- cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 appveyor DownloadFile https://github.com/borg323/onnxruntime/releases/download/v1.13.1/onnxruntime-win-x64-dml-1.13.1.zip
-- cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 7z x onnxruntime-win-x64-dml-1.13.1.zip -oC:\cache
-- cmd: IF %NAME%==onnx-dml set ONNX_NAME=onnxruntime-win-x64-dml-1.13.1
+- cmd: IF %NAME%==onnx set ONNX_NAME=onnxruntime-win-x64-dml-1.22.1
+- cmd: IF %NAME%==onnx set ONNX_NAME_TWO=onnxruntime-win-x64-gpu-1.22.1
+- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.1/Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg
+- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME% 7z x Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg -oC:\cache\%ONNX_NAME%
+- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME_TWO% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.1/onnxruntime-win-x64-gpu-1.22.1.zip
+- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME_TWO% 7z x onnxruntime-win-x64-gpu-1.22.1.zip -oC:\cache
 - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS appveyor DownloadFile https://sjeng.org/ftp/OpenBLAS-0.3.3-win-oldthread.zip
 - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS 7z x OpenBLAS-0.3.3-win-oldthread.zip -oC:\cache\OpenBLAS
 - cmd: IF %OPENCL%==true nuget install opencl-nug -Version 0.777.77 -OutputDirectory C:\cache
@@ -65,26 +73,36 @@ install:
 - cmd: IF %ISPC%==true IF NOT EXIST C:\cache\ispc-v1.13.0-windows appveyor DownloadFile https://github.com/ispc/ispc/releases/download/v1.13.0/ispc-v1.13.0-windows.zip
 - cmd: IF %ISPC%==true IF NOT EXIST C:\cache\ispc-v1.13.0-windows 7z x ispc-v1.13.0-windows.zip -oC:\cache\ispc-v1.13.0-windows
 - cmd: IF %ISPC%==true set PATH=C:\cache\ispc-v1.13.0-windows\bin;%PATH%
-- cmd: set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0"
+- cmd: set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1"
 - cmd: IF %CUDNN%==true IF NOT EXIST "%CUDA_PATH%\cuda" set CUDNN_INSTALL=1
-- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.nvidia.com/compute/cuda/10.0/Prod/network_installers/cuda_10.0.130_win10_network
-- cmd: IF DEFINED CUDNN_INSTALL cuda_10.0.130_win10_network -s nvcc_10.0 cublas_dev_10.0 cublas_10.0 cudart_10.0
-- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile http://developer.download.nvidia.com/compute/redist/cudnn/v7.4.2/cudnn-10.0-windows10-x64-v7.4.2.24.zip
-- cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.0-windows10-x64-v7.4.2.24.zip -o"%CUDA_PATH%"
-- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1"
+- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.243_win10_network.exe
+- cmd: IF DEFINED CUDNN_INSTALL cuda_10.1.243_win10_network -s nvcc_10.1 cublas_dev_10.1 cublas_10.1 cudart_10.1
+- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/redist/cudnn/v7.5.1/cudnn-10.1-windows10-x64-v7.5.1.10.zip
+- cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.1-windows10-x64-v7.5.1.10.zip -o"%CUDA_PATH%"
+- cmd: IF %NAME%==gpu-nvidia-cuda11 set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1"
+- cmd: IF %NAME%==gpu-nvidia-cuda12 set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9"
+- cmd: IF %NAME%==onnx set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9"
 - cmd: IF %CUDA%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1
-- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe
-- cmd: IF DEFINED CUDA_INSTALL cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1
+- cmd: IF %ONNX%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1
+- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda11 appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe
+- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda11 cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1
+- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda12 appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe
+- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda12 cuda_12.9.0_windows_network.exe -s nvcc_12.9 cublas_dev_12.9 cublas_12.9 curand_dev_12.9 cudart_12.9 documentation_12.9
+- cmd: IF %NAME%==gpu-nvidia-cuda12 IF NOT EXIST C:\cache\cutlass-2.11.0 appveyor DownloadFile https://github.com/NVIDIA/cutlass/archive/refs/tags/v2.11.0.zip
+- cmd: IF %NAME%==gpu-nvidia-cuda12 IF NOT EXIST C:\cache\cutlass-2.11.0 7z x v2.11.0.zip -oC:\cache\
+- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==onnx appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe
+- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==onnx cuda_12.9.0_windows_network.exe -s nvcc_12.9 cudart_12.9
 - cmd: IF %CUDA%==true set PATH=%CUDA_PATH%\bin;%PATH%
-- cmd: set PATH=C:\Python36;C:\Python36\scripts;%PATH%
-- cmd: pip3 install --upgrade meson==0.55.3
-- cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.7.1
-- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" appveyor DownloadFile https://github.com/microsoft/mimalloc/archive/refs/tags/v1.7.1.zip
-- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" 7z x v1.7.1.zip -oC:\cache\
-- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%"\out msbuild "%MIMALLOC_PATH%"\ide\vs2017\mimalloc-override.vcxproj /p:Configuration=Release /m
-- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 appveyor DownloadFile https://dl.google.com/android/repository/android-ndk-r19c-windows-x86_64.zip
-- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 7z x android-ndk-r19c-windows-x86_64.zip -oC:\ndk
-- cmd: IF %NAME%==android set PATH=C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64\bin;%PATH%
+- cmd: IF %ONNX%==true set PATH=%CUDA_PATH%\bin;%PATH%
+- cmd: set PATH=C:\Python310;C:\Python310\scripts;%PATH%
+#- cmd: pip3 install --upgrade meson==0.55.3
+- cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.8.7
+- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" appveyor DownloadFile https://github.com/microsoft/mimalloc/archive/refs/tags/v1.8.7.zip
+- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" 7z x v1.8.7.zip -oC:\cache\
+- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%"\out msbuild "%MIMALLOC_PATH%"\ide\vs2019\mimalloc-override.vcxproj /p:Configuration=Release /m
+- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 appveyor DownloadFile https://dl.google.com/android/repository/android-ndk-r27c-windows.zip
+- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 7z x android-ndk-r27c-windows.zip -oC:\ndk
+- cmd: IF %NAME%==android set PATH=C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64\bin;%PATH%
 - cmd: IF %NAME%==android sed "s/clang+*/&.cmd/" cross-files/aarch64-linux-android >crossfile-aarch64
 - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 appveyor DownloadFile https://github.com/borg323/OpenBLAS/releases/download/android-0.3.27/openblas-android-aarch64.zip
 - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 7z x openblas-android-aarch64.zip -oC:\cache\OpenBLAS
@@ -97,18 +115,21 @@ install:
 - cmd: touch -t 201801010000.00 c:\cache\%NET%.pb.gz
 - cmd: IF %GTEST%==true IF NOT EXIST C:\cache\syzygy mkdir C:\cache\syzygy
 - cmd: IF %GTEST%==true cd C:\cache\syzygy
-- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK.rtb{w,z}
-- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtb{w,z}
-- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtb{w,z}
+- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}vK.rtbz
+- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtbz
+- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtbz
+- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}vK.rtbw
+- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtbw
+- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtbw
 - cmd: cd C:\projects\lc0
 cache:
   - C:\cache
-  - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0'
+  - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1'
   - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1'
+  - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9 -> appveyor.yml'
   - C:\projects\lc0\subprojects\packagecache
-  - C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64
+  - C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64
 before_build:
-- cmd: git submodule update --init --recursive
 - cmd: IF %BLAS%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h
 - cmd: IF %ANDROID%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h
 - cmd: SET BUILD_BLAS=%BLAS%
@@ -123,8 +144,9 @@ before_build:
 - cmd: IF %CUDA%==true SET F16C=false
 - cmd: SET EXTRA=
 - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md
-- cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include
-- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
+- cmd: IF %ONNX%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\runtimes\win-x64\native\ -Donnx_include=C:\cache\%ONNX_NAME%\build\native\include -Ddefault_backend=onnx-trt -Dplain_cuda=false
+- cmd: IF %NAME%==gpu-nvidia-cuda12 SET EXTRA=-Db_vscrt=md -Dcutlass=true -Dcutlass_include=C:\cache\cutlass-2.11.0\include
+- cmd: IF %ANDROID%==false meson build --backend vs2019 --buildtype release -Dgtest=false -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
 - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64
 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false
 build_script:
@@ -136,7 +158,7 @@ after_build:
 - cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true call scripts\appveyor_android_package.cmd
 - cmd: cd C:\projects\lc0
 artifacts:
-  - path: build/lc0.exe
+  - path: /build/lc0*.exe/
     name: lc0-$(NAME)
   - path: arm64-v8a/lc0
     name: lc0-android-arm64-v8a
@@ -166,6 +188,7 @@ deploy:
 test_script:
 - cmd: IF %GTEST%==true cd build
 - cmd: IF %GTEST%==true xcopy /s /i C:\cache\syzygy syzygy
+- cmd: IF %GTEST%==true IF %ONNX%==true copy %PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll
 - cmd: IF %GTEST%==true meson test --print-errorlogs
 - cmd: cd C:\projects\lc0
 on_finish:
diff --git a/build-sycl.cmd b/build-sycl.cmd
new file mode 100644
index 0000000000..7f4d626d77
--- /dev/null
+++ b/build-sycl.cmd
@@ -0,0 +1,61 @@
+@echo off
+setlocal
+
+rem 1. Set the following for the options you want to build.
+rem SYCL can be off, l0, amd or nvidia.
+set SYCL=l0
+set CUDNN=true
+set CUDA=true
+set DX12=false
+set OPENCL=false
+set MKL=false
+set DNNL=true
+set OPENBLAS=false
+set EIGEN=false
+set TEST=false
+
+rem 2. Edit the paths for the build dependencies.
+set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0
+set CUDNN_PATH=%CUDA_PATH%
+set OPENBLAS_PATH=C:\OpenBLAS
+set MKL_PATH=C:\Program Files (x86)\Intel\oneAPI\mkl\latest\
+set DNNL_PATH=C:\Program Files (x86)\Intel\oneAPI\dnnl\latest\cpu_iomp
+set OPENCL_LIB_PATH=%CUDA_PATH%\lib\x64
+set OPENCL_INCLUDE_PATH=%CUDA_PATH%\include
+
+rem 3. In most cases you won't need to change anything further down.
+echo Deleting build directory:
+rd /s build
+
+rem Use cl for C files to get a resource compiler as needed for zlib.
+set CC=cl
+set CXX=icx
+
+set BLAS=true
+if %MKL%==false if %DNNL%==false if %OPENBLAS%==false if %EIGEN%==false set BLAS=false
+
+if "%CUDA_PATH%"=="%CUDNN_PATH%" (
+  set CUDNN_LIB_PATH=%CUDNN_PATH%\lib\x64
+  set CUDNN_INCLUDE_PATH=%CUDNN_PATH%\include
+) else (
+  set CUDNN_LIB_PATH=%CUDA_PATH%\lib\x64,%CUDNN_PATH%\lib\x64
+  set CUDNN_INCLUDE_PATH=%CUDA_PATH%\include,%CUDNN_PATH%\include
+)
+
+if %CUDNN%==true set PATH=%CUDA_PATH%\bin;%PATH%
+
+meson setup build --buildtype release -Ddx=%DX12% -Dcudnn=%CUDNN% -Dplain_cuda=%CUDA% ^
+-Dopencl=%OPENCL% -Dblas=%BLAS% -Dmkl=%MKL% -Dopenblas=%OPENBLAS% -Ddnnl=%DNNL% -Dgtest=%TEST% ^
+-Dcudnn_include="%CUDNN_INCLUDE_PATH%" -Dcudnn_libdirs="%CUDNN_LIB_PATH%" ^
+-Dmkl_include="%MKL_PATH%\include" -Dmkl_libdirs="%MKL_PATH%\lib\intel64" -Ddnnl_dir="%DNNL_PATH%" ^
+-Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
+-Dopenblas_include="%OPENBLAS_PATH%\include" -Dopenblas_libdirs="%OPENBLAS_PATH%\lib" ^
+-Ddefault_library=static -Dsycl=%SYCL% -Db_vscrt=md
+
+if errorlevel 1 exit /b
+
+pause
+
+cd build
+
+ninja
\ No newline at end of file
diff --git a/build.cmd b/build.cmd
index 9646029711..262b5ee00e 100644
--- a/build.cmd
+++ b/build.cmd
@@ -2,7 +2,7 @@
 setlocal
 
 rem 1. Set the following for the options you want to build.
-set CUDNN=true
+set CUDNN=false
 set CUDA=true
 set DX12=false
 set OPENCL=false
@@ -11,15 +11,24 @@ set DNNL=false
 set OPENBLAS=false
 set EIGEN=false
 set TEST=false
+set CUTLASS=true
+
+if "%CUDA%"=="true" (
+  if not defined CUDA_PATH (
+    echo WARNING: CUDA_PATH environment variable not found. Using default value.
+  ) else (
+    echo CUDA_PATH found in system environment: "%CUDA_PATH%"
+  )
+)
 
 rem 2. Edit the paths for the build dependencies.
-set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0
+if not defined CUDA_PATH set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9
 set CUDNN_PATH=%CUDA_PATH%
+set OPENCL_LIB_PATH=%CUDA_PATH%\lib\x64
+set OPENCL_INCLUDE_PATH=%CUDA_PATH%\include
 set OPENBLAS_PATH=C:\OpenBLAS
 set MKL_PATH=C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl
 set DNNL_PATH=C:\dnnl_win_1.1.1_cpu_vcomp
-set OPENCL_LIB_PATH=%CUDA_PATH%\lib\x64
-set OPENCL_INCLUDE_PATH=%CUDA_PATH%\include
 
 rem 3. In most cases you won't need to change anything further down.
 echo Deleting build directory:
@@ -63,6 +72,7 @@ meson setup build --backend %backend% --buildtype release -Ddx=%DX12% -Dcudnn=%C
 -Dmkl_include="%MKL_PATH%\include" -Dmkl_libdirs="%MKL_PATH%\lib\intel64" -Ddnnl_dir="%DNNL_PATH%" ^
 -Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^
 -Dopenblas_include="%OPENBLAS_PATH%\include" -Dopenblas_libdirs="%OPENBLAS_PATH%\lib" ^
+-Dcutlass="%CUTLASS%" ^
 -Ddefault_library=static
 
 if errorlevel 1 exit /b
diff --git a/build.sh b/build.sh
index fa30e5c3df..8eb935c926 100755
--- a/build.sh
+++ b/build.sh
@@ -24,7 +24,7 @@ if [ -f "${BUILDDIR}/build.ninja" ]
 then
   "${MESON}" configure "${BUILDDIR}" -Dbuildtype="${BUILDTYPE}" -Dprefix="${INSTALL_PREFIX:-/usr/local}" "$@"
 else
-  "${MESON}" "${BUILDDIR}" --buildtype "${BUILDTYPE}" --prefix "${INSTALL_PREFIX:-/usr/local}" "$@"
+  "${MESON}" setup "${BUILDDIR}" --buildtype "${BUILDTYPE}" --prefix "${INSTALL_PREFIX:-/usr/local}" "$@"
 fi
 
 "${MESON}" compile -C "${BUILDDIR}"
diff --git a/build_rescorer.cmd b/build_rescorer.cmd
index ee20db0d58..9d3897c543 100644
--- a/build_rescorer.cmd
+++ b/build_rescorer.cmd
@@ -23,7 +23,7 @@ if exist "C:\Program Files\Microsoft Visual Studio\2022" (
   set backend=vs2017
 )
 
-meson build --backend %backend% --buildtype release -Drescorer=true -Dlc0=false -Dgtest=false -Ddefault_library=static
+meson setup build --backend %backend% --buildtype release -Drescorer=true -Dlc0=false -Dgtest=false -Ddefault_library=static
 
 if errorlevel 1 exit /b
 
@@ -32,4 +32,4 @@ pause
 cd build
 
 msbuild /m /p:Configuration=Release /p:Platform=x64 /p:WholeProgramOptimization=true ^
-/p:PreferredToolArchitecture=x64 rescorer.sln /filelogger
+/p:PreferredToolArchitecture=x64 lc0.sln /filelogger
diff --git a/changelog.txt b/changelog.txt
index 5d208674ac..cdfec68116 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,4 +1,101 @@
-﻿v0.31.0-rc1 (2024-03-25)
+﻿v0.32.0 (2025-08-21)
+~~~~~~~
+* Support for building with cuda 13.
+* README update.
+* Build system improvements.
+
+v0.32.0-rc2 (2025-08-12)
+~~~~~~~
+* Fix for onnx-trt bug, where the wrong network could be used from the cache.
+* Added code to detect RPE nets and give an error instead of bad results.
+* Better instructions in the readme and install script for onnx-trt.
+* Made `UCI_ShowWDL` off by default again as some GUIs have issues.
+* Fixed a long standing issue when compiled with `-ffast-math` (or `icx -O3`).
+* Several improvements to the sycl backend.
+* Several improvements to the metal backend.
+* Refactored the rescorer code and training data header to make them usable by
+  external tools.
+* Relaxed cuda/cudnn version checks so that no warnings are shown for mismatched
+  versions that are supported.
+* Several build system updates.
+* Assorted small fixes and improvements.
+
+v0.32.0-rc1 (2025-07-18)
+~~~~~~~
+The code has been reorganized and undergone major changes. Therefore this
+changelog will be less detailed and describe the changes in major groups.
+* We have a new search API that allows search algorithms to co-exist. Currently
+  available are `classic` (the default), `dag-preview` (more later),
+  `valuehead` and `policyhead`. The default algorithm can be changed either at
+  build time by the `default_search` option or by renaming the executable to
+  include the algorithm name (e.g. lc0-valuehead).
+* We also have a new backend interface that is chess oriented and not tied to
+  the network architecture. The existing backends still use the old interface
+  through a wrapper.
+* The source code is reorganized, with a more logical directory structure.
+* The original search was ported to the new search and backend interfaces and
+  is renamed to `classic`. This has allowed some streamlining and
+  simplifications.
+* The `dag-preview` search is the DAG algorithm that lived in a separate branch
+  up to now. It hasn't been so well tested, that's why it has "preview" in its
+  name for now, but lives in the `src/search/dag-classic` directory.
+* The `valuehead` search replaces `ValueOnly` mode and selects the move with the
+  best value head evaluation.
+* The `policyhead` search is equivalent to a single node search, selecting the
+  best move using just the policy head.
+* The new `default_backend` build option allows overriding the fixed priority
+  for the backend used by default.
+* The new `native_arch` build option overrides the `-march=native` compiler
+  default for linux release builds, to help with distribution package creation.
+* We have a new `sycl` backend that will work with amd, intel and nvidia gpus.
+* There is also a new `onnx-trt` backend, using tensorrt on nvidia gpus.
+* Support simple/normal/pro mode in options was cleaned up, using a common
+  mechanism.
+* Added the `wait` uci extension command to allow running simple tests from the
+  command line.
+* Removed the `fen` uci extension command as it was unnecessarily complicating
+  things.
+* Some preliminary fp8 support was added for onnx and xla. This is not
+  functional, just there to make experimentation easier.
+* Several build system changes and improvements.
+* We now generate binaries for cuda 12, onnx-trt and macos.
+* Support for using lc0 with openbench.
+* New `bench` mode for a quicker benchmark.
+* Assorted small fixes and improvements.
+
+v0.31.2 (2024-10-20)
+~~~~~~~
+* Updated the WDL_mu centipawn fallback.
+* Fix for build issues with newer Linux c++ libraries.
+* Fix for an XLA Mish bug.
+* Minor README.md update.
+
+v0.31.1 (2024-08-11)
+~~~~~~~
+* Make WDL_mu score type work as intended.
+* Fix macos CI builds.
+
+v0.31.0 (2024-06-16)
+~~~~~~~
+* No changes from rc3.
+
+v0.31.0-rc3 (2024-05-29)
+~~~~~~~
+* The `WDLDrawRateTarget` option now accepts the value 0 (new default) to retain
+  raw WDL values if `WDLCalibrationElo` is set to 0 (default).
+* Improvements to the verbose move stats if `WDLEvalObjectivity` is used.
+* The centipawn score is displayed by default for old nets without WDL output.
+* Some build system improvements.
+
+v0.31.0-rc2 (2024-04-16)
+~~~~~~~
+* Changed cuda compilation options to use `-arch=native` or `-arch=all-major`
+  if no specific version is requested, with fallback for older cuda that don't
+  support those options.
+* Updated android builds to use openblas 0.3.27.
+* A few small fixes.
+
+v0.31.0-rc1 (2024-03-25)
 ~~~~~~~
 * The blas, cuda, eigen, metal and onnx backends now have support for multihead
   network architecture and can run BT3/BT4 nets.
@@ -39,6 +136,9 @@
   natively higher draw rates.
 * Made the WDL Rescale sharpness limit configurable via the `--wdl-max-s` hidden
   option.
+* The search task workers can be set automatically, to either 0 for cpu backends
+  or up to 4 depending on the number of cpu cores. This is enabled by
+  `--task-workers=-1` (the new default).
 * Several assorted fixes and code cleanups.
 
 v0.30.0 (2023-07-21)
diff --git a/cross-files/aarch64-linux-android b/cross-files/aarch64-linux-android
index 4a55d838de..75e9e63de9 100644
--- a/cross-files/aarch64-linux-android
+++ b/cross-files/aarch64-linux-android
@@ -1,5 +1,5 @@
 
-# Tested with Android NDK r19c, default toolchain
+# Tested with Android NDK r27c, default toolchain
 # Targeting API level 21
 
 # Set the toolchain path on your environment
@@ -17,8 +17,8 @@ cpp_link_args = ['-llog', '-static-libstdc++']
 [binaries]
 c = 'aarch64-linux-android21-clang'
 cpp = 'aarch64-linux-android21-clang++'
-ar = 'aarch64-linux-android-ar'
-strip = 'aarch64-linux-android-strip'
-ld = 'aarch64-linux-android-ld'
-ranlib = 'aarch64-linux-android-ranlib'
-as = 'aarch64-linux-android-as'
+ar = 'llvm-ar'
+strip = 'llvm-strip'
+ld = 'ld'
+ranlib = 'llvm-ranlib'
+as = 'aarch64-linux-android21-clang'
diff --git a/cross-files/armv7a-linux-android b/cross-files/armv7a-linux-android
index 16b3e93f90..3fed7aee8b 100644
--- a/cross-files/armv7a-linux-android
+++ b/cross-files/armv7a-linux-android
@@ -1,5 +1,5 @@
 
-# Tested with Android NDK r19c, default toolchain
+# Tested with Android NDK r27c, default toolchain
 # Targeting API level 21
 
 # When targeting API levels < 24 the build fails unless _FILE_OFFSET_BITS is unset.
@@ -24,8 +24,8 @@ cpp_link_args = ['-llog', '-static-libstdc++']
 [binaries]
 c = 'armv7a-linux-androideabi21-clang'
 cpp = 'armv7a-linux-androideabi21-clang++'
-ar = 'arm-linux-androideabi-ar'
-strip = 'arm-linux-androideabi-strip'
-ld = 'arm-linux-androideabi-ld'
-ranlib = 'arm-linux-androideabi-ranlib'
-as = 'arm-linux-androideabi-as'
+ar = 'llvm-ar'
+strip = 'llvm-strip'
+ld = 'ld'
+ranlib = 'llvm-ranlib'
+as = 'armv7a-linux-androideabi21-clang'
diff --git a/dist/README-onnx-dml.txt b/dist/README-onnx-dml.txt
index 5e34b3eb52..c86029e5cb 100644
--- a/dist/README-onnx-dml.txt
+++ b/dist/README-onnx-dml.txt
@@ -7,7 +7,7 @@ neural network, specifically those of the LeelaChessZero project
 To run this version you will most likely need a very recent DirectML dll,
 which you can get by running the included `install.cmd` script. Alternatively,
 you can download the currently latest nuget installer package from
-<https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.10.0>.
+<https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.15.4>.
 If you don't know how to use nuget installer packages, you can change the
 extension to .zip and open it as a normal zip file, the dll you need is
 `/bin/x64-win/DirectML.dll`.
diff --git a/dist/README-onnx-trt.txt b/dist/README-onnx-trt.txt
new file mode 100644
index 0000000000..8a50b2689e
--- /dev/null
+++ b/dist/README-onnx-trt.txt
@@ -0,0 +1,88 @@
+# Lc0
+
+Lc0 is a UCI-compliant chess engine designed to play chess via
+neural network, specifically those of the LeelaChessZero project
+(https://lczero.org).
+
+# Installation
+
+Summary: run `install.cmd` and follow the instructions.
+
+To run this version you will also need several dll files from NVIDIA's
+CUDA, cuDNN and TensorRT. Those dlls can either be on the system path
+from a separate installation of these libraries, or can be placed
+directly in the Lc0 folder. Either way, you will get an error message
+for any that isn't found.
+
+The dlls needed are the following:
+
+1. CUDA
+* cublas64_12.dll
+* cublasLt64_12.dll
+* cudart64_12.dll
+* cufft64_11.dll
+
+2. cuDNN
+* cudnn64_9.dll
+* cudnn_graph64_9.dll
+
+3. TensorRT:
+* nvinfer_10.dll
+* nvinfer_builder_resource_10.dll
+* nvinfer_plugin_10.dll
+* nvonnxparser_10.dll
+
+The install.cmd script included in this package will download the
+CUDA and cuDNN files needed and will open the TensorRT download page
+using your browser. If it fails, you can download the files manually
+using the following addresses; the dlls are in the `bin` directory
+in the CUDA/cuDNN zips and the `lib` directory in the TensorRT zip.
+
+* https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.79-archive.zip
+* https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.1.4-archive.zip
+* https://developer.download.nvidia.com/compute/cuda/redist/libcufft/windows-x86_64/libcufft-windows-x86_64-11.4.1.4-archive.zip
+* https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.11.0.98_cuda12-archive.zip
+* https://developer.nvidia.com/tensorrt/download/10x#trt1012
+
+The TensorRT link will take you to the download page; after
+registering, go to the "TensorRT 10.12 GA for x86_64 Architecture"
+section and get the "TensorRT 10.12 GA for Windows 10, 11,
+Server 2022 and CUDA 12.0 to 12.9 ZIP Package".
+
+Finally, if Lc0 still won't run, get the latest Visual C++
+redistributable from: https://aka.ms/vs/17/release/vc_redist.x64.exe
+
+# Running
+
+When running Lc0 with a new network file, it will take some time to
+create the optimized model to use. This is normal. The model will be
+cached for future runs in the `trt_cache` folder, so next time it will
+be faster. If you want to experiment you can rename the `trt_cache`
+folder and rerun, sometimes TensorRT will generate a different model
+that may be faster. Moreover, if you are having issues, you can
+delete/rename the cache and rerun.
+
+# License
+
+Leela Chess is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Leela Chess is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+modified version of those libraries), containing parts covered by the
+terms of the respective license agreement, the licensors of this
+Program grant you additional permission to convey the resulting work.
+
diff --git a/dist/install-cuda_12_9.cmd b/dist/install-cuda_12_9.cmd
new file mode 100644
index 0000000000..c5a253093b
--- /dev/null
+++ b/dist/install-cuda_12_9.cmd
@@ -0,0 +1,43 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+echo Installing the CUDA dlls required by the Lc0 cuda backend.
+
+echo 1/4. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.37-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/4. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/bin/cudart64_12.dll >cudart64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/LICENSE >CUDA.txt
+
+del /q tmp_cudart.zip
+
+echo 3/4. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.0.13-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/4. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublas64_12.dll >cublas64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll
+if errorlevel 1 goto error
+
+del /q tmp_cublas.zip
+
+echo Installation successful.
+pause
+exit /b
+
+:error
+cls
+echo Installation failed - you will have to download cuda 12.9 yourself.
+pause
+
diff --git a/dist/install-dml.cmd b/dist/install-dml.cmd
index ca93411a55..d223925866 100644
--- a/dist/install-dml.cmd
+++ b/dist/install-dml.cmd
@@ -6,7 +6,7 @@ cd /d %~dp0
 
 cls
 echo Installing the DirectML.dll version required by the Lc0 onnx-dml backend.
-curl -# --ssl-no-revoke -o tmp_directml.zip https://globalcdn.nuget.org/packages/microsoft.ai.directml.1.10.0.nupkg"
+curl -# --ssl-no-revoke -o tmp_directml.zip https://globalcdn.nuget.org/packages/microsoft.ai.directml.1.15.4.nupkg"
 if errorlevel 1 goto error
 
 tar -xzOf tmp_directml.zip bin/x64-win/DirectML.dll >DirectML.dll
diff --git a/dist/install-trt.cmd b/dist/install-trt.cmd
new file mode 100644
index 0000000000..3538c30b66
--- /dev/null
+++ b/dist/install-trt.cmd
@@ -0,0 +1,99 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+
+echo This script will download and install the CUDA/cuDNN/tensorRT dlls required by the Lc0 onnx-trt backend.
+echo(
+echo If you are using a metered internet connection, be aware the download will be around 3 GB.
+echo(
+pause
+
+echo Installing the CUDA dlls required by the Lc0 onnx-trt backend.
+
+echo 1/6. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.79-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/6. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.79-archive/bin/cudart64_12.dll >cudart64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.79-archive/LICENSE >CUDA.txt
+
+del /q tmp_cudart.zip
+
+echo 3/6. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.1.4-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/6. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.1.4-archive/bin/cublas64_12.dll >cublas64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.1.4-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll
+if errorlevel 1 goto error
+
+del /q tmp_cublas.zip
+
+echo 5/6. Downloading cufft.
+curl -# --ssl-no-revoke -o tmp_cufft.zip https://developer.download.nvidia.com/compute/cuda/redist/libcufft/windows-x86_64/libcufft-windows-x86_64-11.4.1.4-archive.zip"
+if errorlevel 1 goto error
+
+echo 6/6. Extracting files.
+tar -xzOf tmp_cufft.zip libcufft-windows-x86_64-11.4.1.4-archive/bin/cufft64_11.dll >cufft64_11.dll
+if errorlevel 1 goto error
+
+del /q tmp_cufft.zip
+
+echo Installing the cuDNN dlls required by the Lc0 onnx-trt backend.
+
+echo 1/2. Downloading cudnn.
+curl -# --ssl-no-revoke -o tmp_cudnn.zip https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.11.0.98_cuda12-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/2. Extracting files.
+tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/bin/cudnn64_9.dll >cudnn64_9.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/bin/cudnn_graph64_9.dll >cudnn_graph64_9.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/LICENSE >CUDNN.txt
+
+del /q tmp_cudnn.zip
+
+echo Installing the tensorRT dlls required by the Lc0 onnx-trt backend.
+
+echo 1/2. Downloading tensorRT.
+curl -# --ssl-no-revoke -o tmp_tensorrt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.12.0/zip/TensorRT-10.12.0.36.Windows.win10.cuda-12.9.zip"
+if errorlevel 1 goto error
+
+echo 2/2. Extracting files.
+tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_10.dll >nvinfer_10.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_builder_resource_10.dll >nvinfer_builder_resource_10.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_plugin_10.dll >nvinfer_plugin_10.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvonnxparser_10.dll >nvonnxparser_10.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/doc/Readme.txt >TENSORRT.txt
+
+del /q tmp_tensorrt.zip
+
+pause
+exit /b
+
+:error
+cls
+echo Installation failed - see the README for alternative download instructions.
+pause
+
diff --git a/libs/lczero-common b/libs/lczero-common
deleted file mode 160000
index 55e1b382ef..0000000000
--- a/libs/lczero-common
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 55e1b382efadd57903e37f2a2e29caef3ea85799
diff --git a/meson.build b/meson.build
index 27d6c6cb63..63613fb618 100644
--- a/meson.build
+++ b/meson.build
@@ -15,30 +15,22 @@
 # along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
 
 project('lc0', 'cpp',
-        default_options : ['cpp_std=c++17', 'b_ndebug=if-release', 'warning_level=3', 'b_lto=true', 'b_vscrt=mt'],
-        meson_version: '>=0.55')
+        default_options : ['cpp_std=c++20', 'b_ndebug=if-release', 'warning_level=3', 'b_lto=true', 'b_vscrt=mt'],
+        meson_version: '>=0.57')
 
-cc = meson.get_compiler('cpp')
-
-if not cc.has_header('optional') or not cc.has_header('string_view')
-    error('Lc0 requires a compiler supporting C++17, for example g++ v8.0, ' +
-          'clang v5.0 or later (with C++17 stdlib) and Visual Studio 2017 or ' +
-          'later.')
-endif
+fs = import('fs')
 
-if not cc.has_header('charconv')
-    warning('Your compiler or library does not have full C++17 support. ' +
-            'See the README for compilers that are known to be working. ' +
-            'This will become an error in the future.')
-endif
+cc = meson.get_compiler('cpp')
 
 if cc.get_id() == 'clang'
   # Thread safety annotation
   add_project_arguments('-Wthread-safety', language : 'cpp')
 endif
-if cc.get_id() == 'clang' or cc.get_id() == 'gcc'
+if cc.get_id() != 'msvc'
   if get_option('buildtype') == 'release'
-    add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
+     if get_option('native_arch')
+       add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp')
+     endif
   endif
 endif
 if cc.get_id() == 'msvc'
@@ -47,6 +39,7 @@ if cc.get_id() == 'msvc'
 endif
 if host_machine.system() == 'windows'
   add_project_arguments('-DNOMINMAX', language : 'cpp')
+  add_project_arguments(cc.get_supported_arguments(['/source-charset:utf-8']), language : 'cpp')
 endif
 if ['arm', 'aarch64'].contains(host_machine.cpu_family())
   if get_option('neon')
@@ -69,37 +62,19 @@ includes += include_directories('third_party', is_system: true)
 compile_proto = find_program('scripts/compile_proto.py')
 gen = generator(compile_proto, output: ['@BASENAME@.pb.h'],
   arguments : [
-    '--proto_path=@CURRENT_SOURCE_DIR@/libs/lczero-common',
+    '--proto_path=@CURRENT_SOURCE_DIR@',
     '--cpp_out=@BUILD_DIR@',
     '@INPUT@'])
 
-# Handle submodules.
-git = find_program('git', required: false)
-if run_command('scripts/checkdir.py', 'libs/lczero-common/proto', check : false).returncode() != 0
-  if git.found()
-    if run_command(git, 'status', check : false).returncode() == 0
-      message('updating git submodule libs/lczero-common')
-      run_command(git, 'submodule', 'update', '--init', '--recursive', check : false)
-    else
-      message('cloning lczero-common.git into libs/lczero-common')
-      run_command(git, 'clone', '--depth=1',
-                  'https://github.com/LeelaChessZero/lczero-common.git',
-                  'libs/lczero-common/', check : false)
-    endif
-  else
-    error('Please install git to automatically fetch submodules or download the archives manually from GitHub.')
-  endif
-endif
-
 pb_files = [
   'src/utils/protomessage.cc',
-  gen.process('libs/lczero-common/proto/net.proto',
-    preserve_path_from : meson.current_source_dir() + '/libs/lczero-common/')
+  gen.process('proto/net.proto', preserve_path_from : meson.current_source_dir())
 ]
 common_files += pb_files
 
 # Extract git short revision.
 short_rev = 'unknown'
+git = find_program('git', required: false)
 if git.found()
   r = run_command(git, 'rev-parse', '--short', 'HEAD', check : false)
   if r.returncode() == 0
@@ -141,29 +116,30 @@ elif get_option('malloc') != ''
 endif
 
 # ONNX and HLO protobufs.
-gen_proto_src = generator(compile_proto, output: ['@BASENAME@.pb.h'],
-  arguments : [
-    '--proto_path=@CURRENT_SOURCE_DIR@/src',
-    '--cpp_out=@BUILD_DIR@',
-    '@INPUT@'])
+files += gen.process('proto/onnx.proto',
+  preserve_path_from : meson.current_source_dir())
 
-files += gen_proto_src.process('src/neural/onnx/onnx.proto',
-  preserve_path_from : meson.current_source_dir() + '/src/')
-
-files += gen_proto_src.process('src/neural/xla/hlo.proto',
-  preserve_path_from : meson.current_source_dir() + '/src/')
+files += gen.process('proto/hlo.proto',
+  preserve_path_from : meson.current_source_dir())
 
 #############################################################################
 ## Main files
 #############################################################################
 common_files += [
-  'src/chess/bitboard.cc',
   'src/chess/board.cc',
+  'src/chess/gamestate.cc',
   'src/chess/position.cc',
   'src/chess/uciloop.cc',
-  'src/mcts/node.cc',
+  'src/neural/backend.cc',
+  'src/neural/batchsplit.cc',
   'src/neural/decoder.cc',
   'src/neural/encoder.cc',
+  'src/neural/factory.cc',
+  'src/neural/loader.cc',
+  'src/neural/register.cc',
+  'src/neural/shared_params.cc',
+  'src/neural/wrapper.cc',
+  'src/search/classic/node.cc',
   'src/syzygy/syzygy.cc',
   'src/trainingdata/reader.cc',
   'src/trainingdata/trainingdata.cc',
@@ -181,49 +157,61 @@ common_files += [
 ]
 
 files += [
-  'src/benchmark/backendbench.cc',
-  'src/benchmark/benchmark.cc',
+  'src/engine_loop.cc',
   'src/engine.cc',
-  'src/lc0ctl/describenet.cc',
-  'src/lc0ctl/leela2onnx.cc',
-  'src/lc0ctl/onnx2leela.cc',
-  'src/mcts/params.cc',
-  'src/mcts/search.cc',
-  'src/mcts/stoppers/alphazero.cc',
-  'src/mcts/stoppers/common.cc',
-  'src/mcts/stoppers/factory.cc',
-  'src/mcts/stoppers/legacy.cc',
-  'src/mcts/stoppers/simple.cc',
-  'src/mcts/stoppers/smooth.cc',
-  'src/mcts/stoppers/stoppers.cc',
-  'src/mcts/stoppers/timemgr.cc',
-  'src/neural/cache.cc',
-  'src/neural/factory.cc',
-  'src/neural/loader.cc',
-  'src/neural/network_check.cc',
-  'src/neural/network_demux.cc',
+  'src/neural/backends/network_check.cc',
+  'src/neural/backends/network_demux.cc',
+  'src/neural/backends/network_mux.cc',
+  'src/neural/backends/network_random.cc',
+  'src/neural/backends/network_record.cc',
+  'src/neural/backends/network_rr.cc',
+  'src/neural/backends/network_trivial.cc',
+  'src/neural/memcache.cc',
   'src/neural/network_legacy.cc',
-  'src/neural/network_mux.cc',
-  'src/neural/network_random.cc',
-  'src/neural/network_record.cc',
-  'src/neural/network_rr.cc',
-  'src/neural/network_trivial.cc',
   'src/neural/onnx/adapters.cc',
   'src/neural/onnx/builder.cc',
   'src/neural/onnx/converter.cc',
   'src/neural/xla/hlo_builder.cc',
   'src/neural/xla/onnx2hlo.cc',
   'src/neural/xla/print_hlo.cc',
+  'src/neural/xla/xla_tensor.cc',
+  'src/search/classic/params.cc',
+  'src/search/classic/search.cc',
+  'src/search/classic/stoppers/alphazero.cc',
+  'src/search/classic/stoppers/common.cc',
+  'src/search/classic/stoppers/factory.cc',
+  'src/search/classic/stoppers/legacy.cc',
+  'src/search/classic/stoppers/simple.cc',
+  'src/search/classic/stoppers/smooth.cc',
+  'src/search/classic/stoppers/stoppers.cc',
+  'src/search/classic/stoppers/timemgr.cc',
+  'src/search/classic/wrapper.cc',
+  'src/search/register.cc',
   'src/selfplay/game.cc',
   'src/selfplay/loop.cc',
   'src/selfplay/multigame.cc',
   'src/selfplay/tournament.cc',
+  'src/tools/backendbench.cc',
+  'src/tools/benchmark.cc',
+  'src/tools/describenet.cc',
+  'src/tools/leela2onnx.cc',
+  'src/tools/onnx2leela.cc',
   'src/utils/histogram.cc',
   'src/utils/numa.cc',
   'src/utils/weights_adapter.cc',
 ]
+
+files += [
+  'src/search/instamove/instamove.cc',
+]
+
 includes += include_directories('src')
 
+deps += dependency('absl_flat_hash_map',
+                   include_type: 'system',
+                   fallback: ['abseil-cpp', 'absl_container_dep'],
+                   default_options : ['warning_level=0', 'cpp_std=c++20'])
+
 deps += dependency('threads')
 
 #############################################################################
@@ -235,6 +223,17 @@ else
   common_files += 'src/utils/filesystem.posix.cc'
 endif
 
+#############################################################################
+## DAG CLASSIC SEARCH
+#############################################################################
+if get_option('dag_classic')
+  files += [
+    'src/search/dag_classic/node.cc',
+    'src/search/dag_classic/search.cc',
+    'src/search/dag_classic/wrapper.cc',
+  ]
+endif
+
 #############################################################################
 ## BACKENDS
 #############################################################################
@@ -248,7 +247,7 @@ if get_option('build_backends')
   tf_tensorflow_cc_lib = dependency('tensorflow_cc', required: false)
   if get_option('tensorflow') and tf_dl_lib.found() and tf_tensorflow_cc_lib.found()
     deps += [tf_dl_lib, tf_tensorflow_cc_lib]
-    files += 'src/neural/network_tf_cc.cc'
+    files += 'src/neural/backends/network_tf_cc.cc'
     has_backends = true
   endif
 
@@ -319,7 +318,13 @@ if get_option('build_backends')
 
     endif
 
-    deps += dependency('eigen3', fallback: ['eigen', 'eigen_dep']).as_system()
+    eigen_dep = dependency('eigen3')
+    # Check for needed header, bad dependency seen in the wild.
+    if eigen_dep.found() and cc.has_header('Eigen/Core', dependencies: eigen_dep)
+      deps += eigen_dep.as_system()
+    else
+      deps += subproject('eigen').get_variable('eigen_dep').as_system()
+    endif
 
     ispc = find_program('ispc', required: false)
     ispc_arch = 'x86-64'
@@ -373,25 +378,25 @@ if get_option('build_backends')
     endif
 
     blas_files = [
-    'src/neural/blas/convolution1.cc',
-    'src/neural/blas/fully_connected_layer.cc',
-    'src/neural/blas/se_unit.cc',
-    'src/neural/blas/network_blas.cc',
-    'src/neural/blas/winograd_convolution3.cc'
+    'src/neural/backends/blas/convolution1.cc',
+    'src/neural/backends/blas/fully_connected_layer.cc',
+    'src/neural/backends/blas/se_unit.cc',
+    'src/neural/backends/blas/network_blas.cc',
+    'src/neural/backends/blas/winograd_convolution3.cc'
     ]
 
     shared_files = [
-    'src/neural/shared/activation.cc',
-    'src/neural/shared/winograd_filter.cc',
+    'src/neural/backends/shared/activation.cc',
+    'src/neural/backends/shared/winograd_filter.cc',
     ]
 
     files += blas_files
     has_backends = true
 
     if get_option('ispc') and ispc.found()
-      files += iscp_gen.process('src/neural/blas/winograd_transform.ispc')
-      files += iscp_gen.process('src/neural/blas/layer_norm.ispc')
-      files += iscp_gen.process('src/neural/shared/activation.ispc')
+      files += iscp_gen.process('src/neural/backends/blas/winograd_transform.ispc')
+      files += iscp_gen.process('src/neural/backends/blas/layer_norm.ispc')
+      files += iscp_gen.process('src/neural/backends/shared/activation.ispc')
       add_project_arguments('-DUSE_ISPC', language : 'cpp')
     endif
 
@@ -421,15 +426,15 @@ if get_option('build_backends')
   if get_option('opencl') and has_opencl
 
     opencl_files = [
-      'src/neural/opencl/network_opencl.cc',
-      'src/neural/opencl/OpenCL.cc',
-      'src/neural/opencl/OpenCLTuner.cc',
-      'src/neural/opencl/OpenCLBuffers.cc',
+      'src/neural/backends/opencl/network_opencl.cc',
+      'src/neural/backends/opencl/OpenCL.cc',
+      'src/neural/backends/opencl/OpenCLTuner.cc',
+      'src/neural/backends/opencl/OpenCLBuffers.cc',
     ]
 
     shared_files = [
-    'src/neural/shared/activation.cc',
-    'src/neural/shared/winograd_filter.cc',
+    'src/neural/backends/shared/activation.cc',
+    'src/neural/backends/shared/winograd_filter.cc',
     ]
 
     if not opencl_framework.found()
@@ -447,48 +452,45 @@ if get_option('build_backends')
   ## cuDNN
   ## ~~~~~
   cudnn_libdirs = get_option('cudnn_libdirs')
+  nvcc_paths = []
+  foreach p : cudnn_libdirs
+    nvcc_paths += fs.parent(p) + '/bin/nvcc'
+  endforeach
+  nvcc_paths += ['nvcc', '/usr/local/cuda/bin/nvcc', '/opt/cuda/bin/nvcc']
+  message('Looking for nvcc in: ' + ', '.join(nvcc_paths))
   cu_blas = cc.find_library('cublas', dirs: cudnn_libdirs, required: false)
   cu_dnn = cc.find_library('cudnn', dirs: cudnn_libdirs, required: false)
   cu_dart = cc.find_library('cudart', dirs: cudnn_libdirs, required: false)
-  nvcc = find_program('nvcc', '/usr/local/cuda/bin/nvcc', '/opt/cuda/bin/nvcc',
+  nvcc = find_program(nvcc_paths,
                       required: false)
-
-  if (get_option('cudnn') or get_option('plain_cuda')) and cu_blas.found() and cu_dart.found() and nvcc.found()
-    deps += [cu_blas, cu_dart]
-    cuda_files = ['src/neural/cuda/layers.cc']
-    if get_option('cudnn') and cu_dnn.found()
-      deps += cu_dnn
-      cuda_files += 'src/neural/cuda/network_cudnn.cc'
-      cuda_files += 'src/neural/cuda/network_cuda.cc' # To support newer nets.
-      add_project_arguments('-DUSE_CUDNN', language : 'cpp')
-    elif get_option('plain_cuda')
-      cuda_files += 'src/neural/cuda/network_cuda.cc'
-    endif
+  nvcc_ok = false
+  if get_option('nvcc') and nvcc.found()
     foreach d : get_option('cudnn_include')
       if run_command('scripts/checkdir.py', d, check : false).returncode() == 0
         includes += include_directories(d, is_system: true)
       endif
     endforeach
-    includes += include_directories('src/neural/cuda/')
-
-    cuda_arguments = ['-c', '@INPUT@', '-o', '@OUTPUT@',
+    nvcc_arguments = ['-c', '@INPUT@', '-o', '@OUTPUT@',
                       '-I', meson.current_source_dir() + '/src']
     nvcc_help = run_command(nvcc, '-h', check : false).stdout()
     if host_machine.system() == 'windows'
       if get_option('b_vscrt') == 'mt'
-        cuda_arguments += ['-Xcompiler', '-MT']
+        nvcc_arguments += ['-Xcompiler', '-MT']
       elif get_option('b_vscrt') == 'mtd'
-        cuda_arguments += ['-Xcompiler', '-MTd']
+        nvcc_arguments += ['-Xcompiler', '-MTd']
       elif get_option('b_vscrt') == 'mdd' or (get_option('b_vscrt') == 'from_buildtype' and get_option('buildtype') == 'debug')
-        cuda_arguments += ['-Xcompiler', '-MDd']
+        nvcc_arguments += ['-Xcompiler', '-MDd']
       elif get_option('b_vscrt') != 'none'
-        cuda_arguments += ['-Xcompiler', '-MD']
+        nvcc_arguments += ['-Xcompiler', '-MD']
       endif
     else
-      cuda_arguments += ['--std=c++14', '-Xcompiler', '-fPIC']
+      nvcc_arguments += ['--std=c++17', '-Xcompiler', '-fPIC']
+      if get_option('debug')
+        nvcc_arguments += ['-g']
+      endif
     endif
     if get_option('nvcc_ccbin') != ''
-      cuda_arguments += ['-ccbin=' + get_option('nvcc_ccbin')]
+      nvcc_arguments += ['-ccbin=' + get_option('nvcc_ccbin')]
     endif
     cuda_cc = get_option('cc_cuda') # Unfortunately option cuda_cc is reserved.
     nvcc_extra_args = []
@@ -514,26 +516,68 @@ if get_option('build_backends')
       endif
     endif
     foreach x : get_option('cudnn_include')
-      cuda_arguments += ['-I', x]
+      nvcc_arguments += ['-I', x]
     endforeach
     if host_machine.system() == 'windows'
       outputname = '@BASENAME@.obj'
     else
       outputname = '@BASENAME@.o'
     endif
+    nvcc_ok = true
+
+    max_cuda = 0
+    nvcc_dryrun = run_command(nvcc, '--dryrun', nvcc_extra_args, 'foo.cu', check : false).stderr()
+    foreach x : nvcc_dryrun.split()
+      if x.contains('-D__CUDA_ARCH__=')
+        arch = x.substring(16).to_int()
+        if arch > max_cuda
+          max_cuda = arch
+        endif
+      endif
+    endforeach
+  endif
+  if (get_option('cudnn') or get_option('plain_cuda')) and cu_dart.found() and cu_blas.found() and nvcc_ok
+    deps += [cu_blas, cu_dart]
+    cuda_files = ['src/neural/backends/cuda/layers.cc']
+    if get_option('cudnn') and cu_dnn.found()
+      deps += cu_dnn
+      cuda_files += 'src/neural/backends/cuda/network_cudnn.cc'
+      cuda_files += 'src/neural/backends/cuda/network_cuda.cc' # To support newer nets.
+      add_project_arguments('-DUSE_CUDNN', language : 'cpp')
+    elif get_option('plain_cuda')
+      cuda_files += 'src/neural/backends/cuda/network_cuda.cc'
+    endif
+    includes += include_directories('src/neural/backends/cuda/')
 	 files += cuda_files
+
+    if get_option('cutlass') and max_cuda >= 800
+      add_project_arguments('-DUSE_CUTLASS', language : 'cpp')
+      nvcc_arguments += ['-DUSE_CUTLASS']
+      if get_option('cutlass_include') != ''
+        nvcc_arguments += ['-I', get_option('cutlass_include')]
+      else
+        nvcc_arguments += ['-I', subproject('cutlass').get_variable('include_directory')]
+      endif
+      nvcc_arguments += ['-isystem=@CURRENT_SOURCE_DIR@/third_party']
+      files += custom_target('cuda cutlass code',
+        input : 'src/neural/backends/cuda/cutlass_kernels.cu',
+        output : outputname,
+        command : [nvcc, nvcc_extra_args, nvcc_arguments]
+      )
+    endif
+
     files += custom_target('cuda fp32 code',
-      input : 'src/neural/cuda/common_kernels.cu',
+      input : 'src/neural/backends/cuda/common_kernels.cu',
       output : outputname,
-      depend_files: 'src/neural/cuda/winograd_helper.inc',
-      command : [nvcc, nvcc_extra_args, cuda_arguments]
+      depend_files: 'src/neural/backends/cuda/winograd_helper.inc',
+      command : [nvcc, nvcc_extra_args, nvcc_arguments]
     )
 
     files += custom_target('cuda fp16 code',
-      input : 'src/neural/cuda/fp16_kernels.cu',
+      input : 'src/neural/backends/cuda/fp16_kernels.cu',
       output : outputname,
-      depend_files: 'src/neural/cuda/winograd_helper.inc',
-      command : [nvcc, nvcc_extra_args, cuda_arguments]
+      depend_files: 'src/neural/backends/cuda/winograd_helper.inc',
+      command : [nvcc, nvcc_extra_args, nvcc_arguments]
     )
     has_backends = true
   endif
@@ -548,14 +592,14 @@ if get_option('build_backends')
     dx_dxgi = cc.find_library('dxgi')
 
     dx_files = [
-      'src/neural/dx/network_dx.cc',
-      'src/neural/dx/shader_wrapper.cc',
-      'src/neural/dx/layers_dx.cc',
+      'src/neural/backends/dx/network_dx.cc',
+      'src/neural/backends/dx/shader_wrapper.cc',
+      'src/neural/backends/dx/layers_dx.cc',
     ]
     files += dx_files
     deps += [dx_d3d12, dx_dxgi]
 
-    subdir('src/neural/dx/shaders')
+    subdir('src/neural/backends/dx/shaders')
 
     has_backends = true
   endif
@@ -564,8 +608,8 @@ if get_option('build_backends')
     includes += include_directories(get_option('dnnl_dir') + '/include')
     deps += [ dnnl_lib, dependency('openmp', required:true) ]
     files += [
-      'src/neural/onednn/network_onednn.cc',
-      'src/neural/onednn/layers.cc',
+      'src/neural/backends/onednn/network_onednn.cc',
+      'src/neural/backends/onednn/layers.cc',
     ]
     has_backends = true
   endif
@@ -573,24 +617,47 @@ if get_option('build_backends')
   ## ~~~~~~~~~~
   ## ONNX
   ## ~~~~~~~~~~
-  if get_option('onnx_libdir') != '' and get_option('onnx_include') != ''
-    deps += cc.find_library('onnxruntime', dirs: get_option('onnx_libdir'),
-                            required: true)
-    includes += include_directories(get_option('onnx_include'), is_system: true)
+  onnxruntime = cc.find_library('onnxruntime', dirs: get_option('onnx_libdir'),
+                                required: false)
+  if get_option('onnx') and onnxruntime.found()
+    deps += onnxruntime
+    onnx_inc_dir = get_option('onnx_include')
+    if fs.is_dir(onnx_inc_dir + '/onnxruntime/core/session')
+      # Top level of source dir.
+      onnx_inc_dir += '/onnxruntime/core/session'
+    elif fs.is_dir(onnx_inc_dir + '/onnxruntime')
+      onnx_inc_dir += '/onnxruntime'
+    endif
+    includes += include_directories(onnx_inc_dir, is_system: true)
     cc.has_header('onnxruntime_cxx_api.h', required: true,
-                  args: '-I' + get_option('onnx_include'))
-    if not cc.has_header('cpu_provider_factory.h',
-                         args: '-I' + get_option('onnx_include'))
-      cc.has_header('../providers/cpu/cpu_provider_factory.h', required: true,
-                    args: '-I' + get_option('onnx_include'))
-      includes += include_directories(get_option('onnx_include') + '/../providers/cpu',
-                                      is_system: true)
+                  include_directories: includes)
+    files += 'src/neural/backends/onnx/network_onnx.cc'
+    onnx_conf = configuration_data()
+    if cc.has_header('dml_provider_factory.h', required: false,
+                     include_directories: includes)
+      # The header is not actually needed, used here to detect DML onnxruntime.
+      onnx_conf.set('USE_DML', true)
     endif
-    files += 'src/neural/onnx/network_onnx.cc'
     if cc.find_library('onnxruntime_providers_rocm',
                        dirs: get_option('onnx_libdir'), required: false).found()
-      add_project_arguments('-DUSE_ROCM', language : 'cpp')
+      onnx_conf.set('USE_ROCM', true)
+    endif
+    if cc.find_library('onnxruntime_providers_migraphx',
+                       dirs: get_option('onnx_libdir'), required: false).found()
+      onnx_conf.set('USE_MIGRAPHX', true)
     endif
+    if cu_dart.found() and nvcc_ok
+      onnx_conf.set('USE_ONNX_CUDART', true)
+      deps += cu_dart
+      files += custom_target('cuda onnx code',
+        input : 'src/neural/backends/onnx/onnx_kernels.cu',
+        output : outputname,
+        command : [nvcc, nvcc_extra_args, nvcc_arguments]
+        )
+    else
+      warning('No CUDA support available. Using compatibility implementation for onnx-trt and onnx-cuda.')
+    endif
+    configure_file(output : 'onnx_conf.h', configuration : onnx_conf)
     has_backends = true
   endif
 
@@ -603,35 +670,130 @@ if get_option('build_backends')
                                 modules : ['Foundation', 'Metal', 'MetalPerformanceShaders', 'MetalPerformanceShadersGraph'],
                                 required: get_option('metal'))
 
-  if (metal_frameworks.found() and add_languages('objc', 'objcpp'))
+  if metal_frameworks.found() and add_languages('objc', 'objcpp', native: false)
     deps += metal_frameworks
 
     files += [
-      'src/neural/metal/network_metal.cc',
-      'src/neural/metal/mps/NetworkGraph.mm',
-      'src/neural/metal/mps/MetalNetworkBuilder.mm',
+      'src/neural/backends/metal/network_metal.cc',
+      'src/neural/backends/metal/mps/NetworkGraph.mm',
+      'src/neural/backends/metal/mps/MetalNetworkBuilder.mm',
     ]
 
     has_backends = true
     add_project_arguments('-fobjc-arc', language : 'objc')
     add_project_arguments('-fobjc-arc', language : 'objcpp')
-  endif
 
+    # Minimum macOS version = 12.6.1 (the flag below is given as 12.6)
+    macos_min_version = '12.6'
+    add_project_arguments(
+      '-mmacosx-version-min=' + macos_min_version,
+      language: ['c', 'cpp', 'objc', 'objcpp']
+    )
+  endif
 
   ## ~~~~~~~~
   ## XLA
   ## ~~~~~~~~
   if get_option('xla')
       files += [
-        'src/neural/xla/network_xla.cc',
-        'src/neural/xla/pjrt.cc',
-        'src/neural/xla/xla_runner.cc',
-        'src/neural/xla/xla_tensor.cc',
+        'src/neural/backends/xla/network_xla.cc',
+        'src/neural/backends/xla/pjrt.cc',
+        'src/neural/backends/xla/xla_runner.cc',
       ]
       deps += cc.find_library('dl', required: false)
       has_backends = true
   endif
 
+  ## ~~~~
+  ## Sycl
+  ## ~~~~
+  if get_option('sycl') != 'off'
+      has_backends = true
+      message('Building SYCL')
+      add_project_arguments('-fsycl', language : 'cpp')
+      add_project_link_arguments('-fsycl', language : 'cpp')
+
+      files += 'src/neural/backends/sycl/layers.cc.dp.cpp'
+      files += 'src/neural/backends/sycl/network_sycl.cc.dp.cpp'
+      files += 'src/neural/backends/sycl/common_kernels.dp.cpp'
+      files += 'src/neural/backends/sycl/fp16_kernels.dp.cpp'
+
+      if get_option('sycl') == 'l0'
+        message('Building SYCL for the L0 backend')
+        add_project_arguments('-DMKL_ILP64', language : 'cpp')
+        deps += cc.find_library('mkl_sycl', required: true)
+        deps += cc.find_library('mkl_intel_ilp64', required: true)
+        deps += cc.find_library('mkl_sequential', required: true)
+        deps += cc.find_library('mkl_core', required: true)
+        deps += cc.find_library('OpenCL', required: true)
+      elif get_option('sycl') == 'amd'
+        hip_libdirs = get_option('hip_libdirs')
+        hip_args = []
+        foreach hip_include : get_option('hip_include')
+          if run_command('scripts/checkdir.py', hip_include, check : false).returncode() == 0
+            includes += include_directories(hip_include, is_system: true)
+            hip_args += '-I' + hip_include
+          endif
+        endforeach
+        deps += cc.find_library('hipblas', dirs: hip_libdirs, required: true)
+        cc.has_header('hipblas/hipblas.h', required: true, args: hip_args)
+        deps += cc.find_library('amdhip64', dirs: hip_libdirs, required: true)
+        cc.has_header('hip/hip_runtime.h', required: true, args: hip_args)
+        add_project_arguments('-DUSE_HIPBLAS=ON', language : 'cpp')
+        add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp')
+        amd_gfx = get_option('amd_gfx')
+        if amd_gfx == ''
+          amd_gfx = []
+          agent_enum = find_program('rocm_agent_enumerator', '/opt/rocm/bin/rocm_agent_enumerator',
+                      required: false)
+          if not agent_enum.found()
+            warning( '\'rocm_agent_enumerator\' not found. AMD GPU detection doesn\'t work. You can install rocminfo or set -Damd_gfx.')
+          elif meson.version().version_compare('<1.2.0')
+            warning( 'Automatic AMD GPU detection requires Meson 1.2.0')
+          else
+            agents = run_command(agent_enum, check : false).stdout()
+            agent_list = agents.splitlines()
+            foreach agent : agent_list
+              if agent.startswith('gfx')
+                amd_gfx += 'amd_gpu_' + agent
+              else
+                error( '\'' + agent_enum.full_path() + '\' unexpected output: ' + agent)
+              endif
+            endforeach
+            if amd_gfx.length() == 0
+              warning( '\'' + agent_enum.full_path() + '\' failed to detect any AMD GPUs in the system.')
+            else
+              message( 'Detected AMD GPU cores: ' + ','.join(amd_gfx))
+            endif
+          endif
+        else
+          amd_gfx = ['amd_gpu_' + amd_gfx]
+        endif
+        if amd_gfx.length() == 0
+          error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. gfx90a, gfx1100 or similar)')
+        endif
+        add_project_arguments('-fsycl-targets=' + ','.join(amd_gfx), language : 'cpp')
+        add_project_link_arguments('-fsycl-targets=' + ','.join(amd_gfx), language : 'cpp')
+      else
+        deps += cc.find_library('cublas', required: true)
+        deps += cc.find_library('cudart', required: true)
+        add_project_arguments('-DUSE_CUBLAS=ON', language : 'cpp')
+        if get_option('cc_cuda') != ''
+          sycl_nvidia_target = 'nvidia_gpu_sm_' + get_option('cc_cuda')
+        else
+          sycl_nvidia_target = 'nvptx64-nvidia-cuda'
+        endif
+        add_project_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
+        add_project_link_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
+      endif
+      if host_machine.system() == 'windows'
+        # For sycl under windows we need to link using icx to generate the device code.
+        # This script edits build.ninja for this and for an icx dependency issue.
+        meson.add_postconf_script('scripts/sycl_build_hack.py')
+        add_project_link_arguments('-rtlib=compiler-rt', language : 'cpp')
+      endif
+  endif
+
 endif # if get_option('build_backends')
 
 if not has_backends and get_option('lc0') and get_option('build_backends')
@@ -659,15 +821,53 @@ endif
     deps += dependency('zlib', fallback: ['zlib', 'zlib_dep'])
   endif
 
+  trace_lib = get_option('trace_library')
+  trace_config = configuration_data()
+
+  common_files += 'src/utils/trace.cc'
+  ## ~~~~~~~~
+  ## perfetto
+  ## ~~~~~~~~
+  if trace_lib == 'perfetto'
+    perfetto_dep = dependency('perfetto', required: true,
+                             fallback: ['perfetto', 'dep_perfetto'])
+    deps += perfetto_dep
+    trace_config.set('USE_PERFETTO_TRACE', 1)
+  endif
+
+  ## ~~~~
+  ## nvtx
+  ## ~~~~
+  if trace_lib == 'nvtx'
+    nvtx_includes = get_option('cudnn_include')
+    nvtx_header_found = false
+    foreach d : nvtx_includes
+      if run_command('scripts/checkdir.py', d, check : false).returncode() == 0
+        if cc.has_header('nvtx3/nvtx3.hpp', args: '-I' + d)
+          includes += include_directories(d)
+          nvtx_header_found = true
+          break
+        endif
+      endif
+    endforeach
+    if not nvtx_header_found
+      error('nvtx3/nvtx3.hpp header not found in cudnn_include paths')
+    endif
+    # This could support other tracing apis like systemtap.
+    trace_config.set('USE_NVTX_TRACE', 1)
+  endif
+  configure_file(output : 'trace_config.h',
+                 configuration : trace_config)
+
   ## ~~~~~~~~
   ## Profiler
   ## ~~~~~~~~
   if get_option('buildtype') != 'release'
-    deps += cc.find_library('libprofiler',
+    deps += cc.find_library('profiler',
       dirs: ['/usr/local/lib'], required: false)
   endif
 
-  deps += cc.find_library('libatomic', required: false)
+  deps += cc.find_library('atomic', required: false)
 
 #############################################################################
 ## Main Executable
@@ -681,6 +881,10 @@ if not get_option('f16c')
   add_project_arguments('-DNO_F16C', language : 'cpp')
 endif
 
+if cc.has_type('_Float16')
+  add_project_arguments('-DHAS_FLOAT16', language : 'cpp')
+endif
+
 if not get_option('pext')
   add_project_arguments('-DNO_PEXT', language : 'cpp')
 endif
@@ -689,6 +893,20 @@ if get_option('embed')
   add_project_arguments('-DEMBED', language : 'cpp')
 endif
 
+default_search_h = configuration_data()
+if get_option('default_search') != ''
+  default_search_h.set_quoted('DEFAULT_SEARCH', get_option('default_search'))
+endif
+configure_file(output : 'default_search.h',
+               configuration : default_search_h)
+
+default_backend_h = configuration_data()
+if get_option('default_backend') != ''
+  default_backend_h.set_quoted('DEFAULT_BACKEND', get_option('default_backend'))
+endif
+configure_file(output : 'default_backend.h',
+               configuration : default_backend_h)
+
 if get_option('lc0')
   files += common_files
   executable('lc0', 'src/main.cc',
@@ -700,10 +918,10 @@ endif
 #############################################################################
 
 if get_option('rescorer')
-  deps += subproject('gaviotatb').get_variable('gaviotatb_dep')
+  gaviota_dep = subproject('gaviotatb').get_variable('gaviotatb_dep')
   executable('rescorer', 'src/rescorer_main.cc',
-       [common_files, 'src/rescorer/rescoreloop.cc'],
-       include_directories: includes, dependencies: deps, install: true)
+       [common_files, 'src/trainingdata/rescorer.cc'],
+       include_directories: includes, dependencies: [deps, gaviota_dep], install: true)
 endif
 
 #############################################################################
@@ -712,13 +930,19 @@ endif
 
 if get_option('gtest')
   gtest = dependency('gtest', fallback: ['gtest', 'gtest_dep'])
-  lc0_lib = library('lc0_lib', files, include_directories: includes, dependencies: deps)
+  gmock = dependency('gmock', fallback: ['gtest', 'gmock_dep'])
+  lc0_lib = library('lc0_lib', common_files, include_directories: includes, dependencies: deps)
 
   test('ChessBoard',
     executable('chessboard_test', 'src/chess/board_test.cc',
     include_directories: includes, link_with: lc0_lib, dependencies: gtest
   ), args: '--gtest_output=xml:chessboard.xml', timeout: 90)
 
+  test('FP16',
+    executable('fp16_test', 'src/utils/fp16_utils_test.cc',
+    include_directories: includes, link_with: lc0_lib, dependencies: gtest
+  ), args: '--gtest_output=xml:fp16.xml', timeout: 90)
+
   test('HashCat',
     executable('hashcat_test', 'src/utils/hashcat_test.cc',
     include_directories: includes, link_with: lc0_lib, dependencies: gtest
@@ -744,6 +968,12 @@ if get_option('gtest')
     include_directories: includes, link_with: lc0_lib,
     dependencies: [gtest]
   ), args: '--gtest_output=xml:encoder.xml', timeout: 90)
+
+  test('EngineTest',
+    executable('engine_test', 'src/engine_test.cc', 'src/engine.cc',
+               'src/neural/memcache.cc', pb_files,
+    include_directories: includes, link_with: lc0_lib, dependencies: [gtest, gmock]),
+    args: '--gtest_output=xml:engine_test.xml', timeout: 90)
 endif
 
 
diff --git a/meson_options.txt b/meson_options.txt
index a9f820947b..ec5c53917a 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -43,6 +43,11 @@ option('cudnn_include',
        value: ['/opt/cuda/include/', '/usr/local/cuda/include/', '/usr/lib/cuda/include/'],
        description: 'Paths to cudnn include directory')
 
+option('cutlass_include',
+       type: 'string',
+       value: '',
+       description: 'Paths to cutlass include directory')
+
 option('build_backends',
        type: 'boolean',
        value: true,
@@ -68,9 +73,14 @@ option('native_cuda',
        value: true,
        description: 'build cuda code for native arch only (if supported)')
 
+option('native_arch',
+       type: 'boolean',
+       value: true,
+       description: 'build code for native arch only')
+
 option('cudnn',
        type: 'boolean',
-       value: true,
+       value: false,
        description: 'Enable cuDNN backend')
 
 option('plain_cuda',
@@ -78,14 +88,19 @@ option('plain_cuda',
        value: true,
        description: 'Enable CUDA backend')
 
-option('opencl',
+option('cutlass',
        type: 'boolean',
        value: true,
+       description: 'Enable cutlass lib for cuda backend. Only supports Ampere+ right now')
+
+option('opencl',
+       type: 'boolean',
+       value: false,
        description: 'Enable OpenCL backend')
 
 option('dx',
        type: 'boolean',
-       value: true,
+       value: false,
        description: 'Enable DirectX12 backend')
 
 option('tensorflow',
@@ -105,7 +120,7 @@ option('openblas',
 
 option('mkl',
        type: 'boolean',
-       value: true,
+       value: false,
        description: 'Enable MKL BLAS support')
 
 option('dnnl',
@@ -178,14 +193,24 @@ option('cc_cuda',
        value: '',
        description: 'Build for a specific cuda CC, e.g. -Dcc_cuda=35 for CC 3.5')
 
-option('onnx_libdir',
+option('amd_gfx',
        type: 'string',
        value: '',
+       description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=gfx90a for gfx90a')
+
+option('onnx',
+       type: 'boolean',
+       value: true,
+       description: 'Enable ONNX backends')
+
+option('onnx_libdir',
+       type: 'string',
+       value: '/usr/lib/',
        description: 'Paths to ONNX runtime libraries')
 
 option('onnx_include',
        type: 'string',
-       value: '',
+       value: '/usr/include/onnxruntime/',
        description: 'Paths to ONNX runtime includes')
 
 option('xla',
@@ -193,6 +218,28 @@ option('xla',
        value: false,
        description: 'Enable XLA backend')
 
+option('sycl',
+       type: 'combo',
+       choices: ['off', 'l0', 'amd', 'nvidia'],
+       value: 'off',
+       description: 'Enable SYCL backend')
+
+option('hip_libdirs',
+       type: 'array',
+       value: ['/opt/rocm/lib'],
+       description: 'Paths to AMD HIP libraries')
+
+option('hip_include',
+       type: 'array',
+       value: ['/opt/rocm/include'],
+       description: 'Path to AMD HIP includes')
+
+option('trace_library',
+       type: 'combo',
+       choices: ['off', 'perfetto', 'nvtx'],
+       value: 'off',
+       description: 'Enable trace library support')
+
 option('lc0',
        type: 'boolean',
        value: true,
@@ -202,3 +249,23 @@ option('rescorer',
        type: 'boolean',
        value: false,
        description: 'Build rescorer')
+
+option('default_search',
+       type: 'string',
+       value: '',
+       description: 'Default search algorithm to use, e.g. -Ddefault_search=classic')
+
+option('default_backend',
+       type: 'string',
+       value: '',
+       description: 'Default backend to use, e.g. -Ddefault_backend=onnx-trt')
+
+option('dag_classic',
+       type: 'boolean',
+       value: true,
+       description: 'Enable dag-classic search algorithm')
+
+option('nvcc',
+       type: 'boolean',
+       value: true,
+       description: 'Use nvcc: required for cuda, optional for onnx')
diff --git a/src/neural/xla/hlo.proto b/proto/hlo.proto
similarity index 96%
rename from src/neural/xla/hlo.proto
rename to proto/hlo.proto
index 6ced6f938d..ba1fe21653 100644
--- a/src/neural/xla/hlo.proto
+++ b/proto/hlo.proto
@@ -354,6 +354,15 @@ message CompileEnvOptionProto {
   required OptionOverrideProto value = 2;
 }
 
+message XlaDeviceAssignmentProto {
+  optional int32 replica_count = 1;
+  optional int32 computation_count = 2;
+  message ComputationDevice {
+    repeated int64 replica_device_ids = 1;
+  }
+  repeated ComputationDevice computation_devices = 3;
+}
+
 message ExecutableBuildOptionsProto {
   // If set, this is the device to build the computation for. Valid
   // device_ordinal values are: 0 to # of devices - 1. These values are
@@ -386,6 +395,12 @@ message ExecutableBuildOptionsProto {
   // Whether HLOs should be deduplicated.
   optional bool deduplicate_hlo = 8;
 
+  // If set, this specifies a static device assignment for the computation.
+  // Otherwise, the computation will be compiled generically and can be run with
+  // any device assignment compatible with the computation's replica and
+  // partition counts.
+  optional XlaDeviceAssignmentProto device_assignment = 9;
+
   // Whether input and output buffers are aliased if the associated parameter is
   // passed-through XLA modules without being changed.
   optional bool alias_passthrough_params = 10;
diff --git a/proto/net.proto b/proto/net.proto
new file mode 100644
index 0000000000..961a73992a
--- /dev/null
+++ b/proto/net.proto
@@ -0,0 +1,411 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+syntax = "proto2";
+
+package pblczero;
+
+message EngineVersion {
+  optional uint32 major = 1;
+  optional uint32 minor = 2;
+  optional uint32 patch = 3;
+}
+
+message Weights {
+  message Layer {
+    optional float min_val = 1;
+    optional float max_val = 2;
+    optional bytes params = 3;
+    enum Encoding {
+      UNKNOWN_ENCODING = 0;
+      LINEAR16 = 1;
+      FLOAT16 = 2;
+      BFLOAT16 = 3;
+      FLOAT32 = 4;
+    }
+    optional Encoding encoding = 4;
+    repeated uint32 dims = 5;
+  }
+
+  message ConvBlock {
+    optional Layer weights = 1;
+    optional Layer biases = 2;
+    optional Layer bn_means = 3;
+    optional Layer bn_stddivs = 4;
+    optional Layer bn_gammas = 5;
+    optional Layer bn_betas = 6;
+  }
+
+  message SEunit {
+    // Squeeze-excitation unit (https://arxiv.org/abs/1709.01507)
+    // weights and biases of the two fully connected layers.
+    optional Layer w1 = 1;
+    optional Layer b1 = 2;
+    optional Layer w2 = 3;
+    optional Layer b2 = 4;
+  }
+
+  message Residual {
+    optional ConvBlock conv1 = 1;
+    optional ConvBlock conv2 = 2;
+    optional SEunit se = 3;
+  }
+
+  message Smolgen {
+    // For NETWORK_ATTENTIONBODY_WITH_HEADFORMAT.
+    optional Layer compress = 1;
+    optional Layer dense1_w = 2;
+    optional Layer dense1_b = 3;
+    optional Layer ln1_gammas = 4;
+    optional Layer ln1_betas = 5;
+    optional Layer dense2_w = 6;
+    optional Layer dense2_b = 7;
+    optional Layer ln2_gammas = 8;
+    optional Layer ln2_betas = 9;
+  }
+
+  message MHA {
+    optional Layer q_w = 1;
+    optional Layer q_b = 2;
+    optional Layer k_w = 3;
+    optional Layer k_b = 4;
+    optional Layer v_w = 5;
+    optional Layer v_b = 6;
+    optional Layer dense_w = 7;
+    optional Layer dense_b = 8;
+    optional Smolgen smolgen = 9;
+
+    optional Layer rpe_q = 10;
+    optional Layer rpe_k = 11;
+    optional Layer rpe_v = 12;
+
+    // reserved 13 - 22 for int8 quantization
+  }
+
+  message FFN {
+    optional Layer dense1_w = 1;
+    optional Layer dense1_b = 2;
+    optional Layer dense2_w = 3;
+    optional Layer dense2_b = 4;
+    // reserved 5 - 10 for int8 quantization
+  }
+
+  message EncoderLayer {
+    optional MHA mha = 1;
+    optional Layer ln1_gammas = 2;
+    optional Layer ln1_betas = 3;
+    optional FFN ffn = 4;
+    optional Layer ln2_gammas = 5;
+    optional Layer ln2_betas = 6;
+  }
+
+  message PolicyHead {
+    optional Layer ip_pol_w = 1;
+    optional Layer ip_pol_b = 2;
+    optional Layer ip2_pol_w = 3;  // "wq" in policy attention
+    optional Layer ip2_pol_b = 4;
+    optional Layer ip3_pol_w = 5;  // "wk" in policy attention
+    optional Layer ip3_pol_b = 6;
+    optional Layer ip4_pol_w = 7;  // "ppo" in policy attention
+
+    // Optional policy encoders for policy head.
+    repeated EncoderLayer pol_encoder = 8;
+    optional uint32 pol_headcount = 9;
+
+    // Convolutions for legacy policy head.
+    optional ConvBlock policy1 = 10;
+    optional ConvBlock policy = 11;
+  }
+
+  message ValueHead {
+    optional Layer ip_val_w = 1;  // "embedding" for attention body value
+    optional Layer ip_val_b = 2;
+    optional Layer ip1_val_w = 3;
+    optional Layer ip1_val_b = 4;
+    optional Layer ip2_val_w = 5;
+    optional Layer ip2_val_b = 6;
+    optional Layer ip_val_err_w = 7;
+    optional Layer ip_val_err_b = 8;
+    optional Layer ip_val_cat_w = 9;
+    optional Layer ip_val_cat_b = 10;
+
+    // Legacy value head support.
+    optional ConvBlock value = 11;
+  }
+
+  message PolicyHeadMap {
+    required string key = 1;  // name of the policy head
+    required PolicyHead value = 2;
+  }
+
+  message PolicyHeads {
+    optional Layer ip_pol_w = 1;    // "embedding" in policy attention
+    optional Layer ip_pol_b = 2;
+    optional PolicyHead vanilla = 3;
+    optional PolicyHead optimistic_st = 4;
+    optional PolicyHead soft = 5;
+    optional PolicyHead opponent = 6;
+    // map<string, PolicyHead> policy_head_map = 7;
+    repeated PolicyHeadMap policy_head_map = 7;
+  }
+
+  message ValueHeadMap {
+    required string key = 1;  // name of the value head
+    required ValueHead value = 2;
+  }
+
+  message ValueHeads {
+    optional ValueHead winner = 1;
+    optional ValueHead q = 2;
+    optional ValueHead st = 3;
+    // map<string, ValueHead> value_head_map = 4;
+    repeated ValueHeadMap value_head_map = 4;
+  }
+
+  // Input convnet.
+  optional ConvBlock input = 1;
+
+  // Residual tower.
+  repeated Residual residual = 2;
+
+  // Embedding layer for attention body encoders
+  // (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT).
+
+  optional Layer ip_emb_preproc_w = 37;
+  optional Layer ip_emb_preproc_b = 38;
+
+  optional Layer ip_emb_w = 25;
+  optional Layer ip_emb_b = 26;
+
+  optional Layer ip_emb_ln_gammas = 39;
+  optional Layer ip_emb_ln_betas = 40;
+
+
+
+  // Input gating (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT).
+  optional Layer ip_mult_gate = 33;
+  optional Layer ip_add_gate = 34;
+
+  optional FFN ip_emb_ffn = 41;
+  optional Layer ip_emb_ffn_ln_gammas = 42;
+  optional Layer ip_emb_ffn_ln_betas = 43;
+
+  // Encoder stack (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT).
+  repeated EncoderLayer encoder = 27;
+  optional uint32 headcount = 28;
+
+  // Policy encoder stack
+  // The ffn activation up to and including NETWORK_SE_WITH_HEADFORMAT is SELU,
+  // otherwise it follows the ffn activation setting.
+  repeated EncoderLayer pol_encoder = 21;
+  optional uint32 pol_headcount = 24;
+
+  // Policy head
+  // Extra convolution for AZ-style policy head
+  optional ConvBlock policy1 = 11;
+  optional ConvBlock policy = 3;
+  optional Layer ip_pol_w = 4;    // "embedding" in policy attention
+  optional Layer ip_pol_b = 5;
+  // For policy attention, up to and including NETWORK_SE_WITH_HEADFORMAT the
+  // "embedding" activation is SELU, otherwise it is the default activation.
+  optional Layer ip2_pol_w = 17;  // "wq" in policy attention
+  optional Layer ip2_pol_b = 18;
+  optional Layer ip3_pol_w = 19;  // "wk" in policy attention
+  optional Layer ip3_pol_b = 20;
+  optional Layer ip4_pol_w = 22;  // "ppo" in policy attention
+
+  // Value head
+  optional ConvBlock value = 6;
+  optional Layer ip_val_w = 29;  // "embedding" for attention body value
+  optional Layer ip_val_b = 30;
+  optional Layer ip1_val_w = 7;
+  optional Layer ip1_val_b = 8;
+  optional Layer ip2_val_w = 9;
+  optional Layer ip2_val_b = 10;
+
+  optional ValueHeads value_heads = 44;
+  optional PolicyHeads policy_heads = 45;
+
+  // Moves left head
+  optional ConvBlock moves_left = 12;
+  optional Layer ip_mov_w = 31;  // "embedding" for attention body moves left
+  optional Layer ip_mov_b = 32;
+  optional Layer ip1_mov_w = 13;
+  optional Layer ip1_mov_b = 14;
+  optional Layer ip2_mov_w = 15;
+  optional Layer ip2_mov_b = 16;
+
+  // Global smolgen weights (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT).
+  optional Layer smolgen_w = 35;
+  optional Layer smolgen_b = 36;
+}
+
+message TrainingParams {
+  optional uint32 training_steps = 1;
+  optional float learning_rate = 2;
+  optional float mse_loss = 3;
+  optional float policy_loss = 4;
+  optional float accuracy = 5;
+  optional string lc0_params = 6;
+}
+
+message NetworkFormat {
+  // Format to encode the input planes with. Used by position encoder.
+  enum InputFormat {
+    INPUT_UNKNOWN = 0;
+    INPUT_CLASSICAL_112_PLANE = 1;
+    INPUT_112_WITH_CASTLING_PLANE = 2;
+    INPUT_112_WITH_CANONICALIZATION = 3;
+    INPUT_112_WITH_CANONICALIZATION_HECTOPLIES = 4;
+    INPUT_112_WITH_CANONICALIZATION_HECTOPLIES_ARMAGEDDON = 132;
+    INPUT_112_WITH_CANONICALIZATION_V2 = 5;
+    INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON = 133;
+  }
+  optional InputFormat input = 1;
+
+  // Output format of the NN. Used by search code to interpret results.
+  enum OutputFormat {
+    OUTPUT_UNKNOWN = 0;
+    OUTPUT_CLASSICAL = 1;
+    OUTPUT_WDL = 2;
+  }
+  optional OutputFormat output = 2;
+
+  // Network architecture. Used by backends to build the network.
+  enum NetworkStructure {
+    // Networks without PolicyFormat or ValueFormat specified
+    NETWORK_UNKNOWN = 0;
+    NETWORK_CLASSICAL = 1;
+    NETWORK_SE = 2;
+    // Networks with PolicyFormat and ValueFormat specified
+    NETWORK_CLASSICAL_WITH_HEADFORMAT = 3;
+    NETWORK_SE_WITH_HEADFORMAT = 4;
+    NETWORK_ONNX = 5;
+    NETWORK_ATTENTIONBODY_WITH_HEADFORMAT = 6;
+    NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT = 7;
+    NETWORK_AB_LEGACY_WITH_MULTIHEADFORMAT = 134;
+  }
+  optional NetworkStructure network = 3;
+
+  // Policy head architecture
+  enum PolicyFormat {
+    POLICY_UNKNOWN = 0;
+    POLICY_CLASSICAL = 1;
+    POLICY_CONVOLUTION = 2;
+    POLICY_ATTENTION = 3;
+  }
+  optional PolicyFormat policy = 4;
+
+  // Value head architecture
+  enum ValueFormat {
+    VALUE_UNKNOWN = 0;
+    VALUE_CLASSICAL = 1;
+    VALUE_WDL = 2;
+    VALUE_PARAM = 3;
+  }
+  optional ValueFormat value = 5;
+
+  // Moves left head architecture
+  enum MovesLeftFormat {
+    MOVES_LEFT_NONE = 0;
+    MOVES_LEFT_V1 = 1;
+  }
+  optional MovesLeftFormat moves_left = 6;
+
+  enum ActivationFunction {
+    ACTIVATION_DEFAULT = 0;
+    ACTIVATION_MISH = 1;
+    ACTIVATION_RELU = 2;
+    ACTIVATION_NONE = 3;
+    ACTIVATION_TANH = 4;
+    ACTIVATION_SIGMOID = 5;
+    ACTIVATION_SELU = 6;
+    ACTIVATION_SWISH = 7;
+    ACTIVATION_RELU_2 = 8;
+    ACTIVATION_SOFTMAX = 9;
+  }
+
+  // Activation used everywhere except head outputs or otherwise specified.
+  enum DefaultActivation {
+    DEFAULT_ACTIVATION_RELU = 0;
+    DEFAULT_ACTIVATION_MISH = 1;
+  }
+  optional DefaultActivation default_activation = 7;
+
+  optional ActivationFunction smolgen_activation = 8;
+  optional ActivationFunction ffn_activation = 9;
+
+  enum InputEmbeddingFormat {
+    INPUT_EMBEDDING_NONE = 0;
+    INPUT_EMBEDDING_PE_MAP = 1;
+    INPUT_EMBEDDING_PE_DENSE = 2;
+  }
+  optional InputEmbeddingFormat input_embedding = 10;
+}
+
+message Format {
+  enum Encoding {
+    UNKNOWN = 0;
+    LINEAR16 = 1;
+  }
+  // Any encoding specified in a Layer overrides this.
+  optional Encoding weights_encoding = 1;
+  // If network_format is missing, it's assumed to have
+  // INPUT_CLASSICAL_112_PLANE / OUTPUT_CLASSICAL / NETWORK_CLASSICAL format.
+  optional NetworkFormat network_format = 2;
+}
+
+message OnnxModel {
+  enum DataType {
+    UNKNOWN_DATATYPE = 0;
+    FLOAT = 1;
+    FLOAT16 = 10;
+    BFLOAT16 = 16;
+  }
+
+  // Serialized OnnxProto model.
+  optional bytes model = 1;
+  optional DataType data_type = 2;
+  // Name of the input tensor to populate.
+  optional string input_planes = 3;
+  // Names of the output tensors to get results from.
+  // If some feature is not present, corresponding values are not set.
+  optional string output_value = 4;
+  optional string output_wdl = 5;
+  optional string output_policy = 6;
+  optional string output_mlh = 7;
+}
+
+message Net {
+  optional fixed32 magic = 1;
+  optional string license = 2;
+  optional EngineVersion min_version = 3;
+  optional Format format = 4;
+  optional TrainingParams training_params = 5;
+  // Either weights or onnx_model is set, but not both.
+  optional Weights weights = 10;
+  optional OnnxModel onnx_model = 11;
+}
diff --git a/src/neural/onnx/onnx.proto b/proto/onnx.proto
similarity index 100%
rename from src/neural/onnx/onnx.proto
rename to proto/onnx.proto
diff --git a/scripts/appveyor_android_build.cmd b/scripts/appveyor_android_build.cmd
index 9f2f79665a..a9f3f01860 100644
--- a/scripts/appveyor_android_build.cmd
+++ b/scripts/appveyor_android_build.cmd
@@ -1,7 +1,7 @@
 cd arm64-v8a
 ninja
-aarch64-linux-android-strip lc0
+llvm-strip lc0
 cd C:\projects\lc0
 cd armeabi-v7a
 ninja
-arm-linux-androideabi-strip lc0
+llvm-strip lc0
diff --git a/scripts/appveyor_win_build.cmd b/scripts/appveyor_win_build.cmd
index 43ab5f211a..00e739d567 100644
--- a/scripts/appveyor_win_build.cmd
+++ b/scripts/appveyor_win_build.cmd
@@ -1,5 +1,5 @@
 SET PGO=false
-IF %APPVEYOR_REPO_TAG%==true IF %DX%==false IF %ONNX_DML%==false SET PGO=true
+IF %APPVEYOR_REPO_TAG%==true IF %DX%==false IF %ONNX%==false SET PGO=true
 IF %PGO%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
 IF EXIST build\lc0.pdb del build\lc0.pdb
 IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGInstrument /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
@@ -19,3 +19,12 @@ IF %PGO%==true (
 )
 cd ..
 IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /p:DebugInformationFormat=ProgramDatabase /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+IF %NAME%==onnx (
+  ren build\lc0.exe lc0-trt.exe
+  meson configure build -Ddefault_backend= -Dcudnn_libdirs= -Dgtest=%GTEST%
+  REM The REGEN target is needed as a separate step before rebuilding.
+  msbuild "C:\projects\lc0\build\lc0.sln" /target:REGEN
+  IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /p:DebugInformationFormat=ProgramDatabase /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+  IF %PGO%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+  ren build\lc0.exe lc0-dml.exe
+)
diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd
index 36f98d8eef..eaf1ba73b7 100644
--- a/scripts/appveyor_win_package.cmd
+++ b/scripts/appveyor_win_package.cmd
@@ -1,6 +1,6 @@
 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip %APPVEYOR_BUILD_FOLDER%\build\lc0.exe
-IF %NAME%==gpu-nvidia-cuda appveyor DownloadFile "https://github.com/LeelaChessZero/lczero-client/releases/latest/download/lc0-training-client.exe"
-IF %NAME%==gpu-nvidia-cuda 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-training-client.exe
+IF %NAME%==gpu-nvidia-cuda12 appveyor DownloadFile "https://github.com/LeelaChessZero/lczero-client/releases/latest/download/lc0-training-client.exe"
+IF %NAME%==gpu-nvidia-cuda12 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-training-client.exe
 type COPYING |more /P > dist\COPYING
 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\COPYING
 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip c:\cache\%NET%.pb.gz
@@ -15,26 +15,50 @@ IF %NAME%==cpu-openblas 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\
 IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll
 IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll
 IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.77\build\native\bin\OpenCL.dll
-IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_100.dll" "%CUDA_PATH%\bin\cublas64_100.dll"
+IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_101.dll" "%CUDA_PATH%\bin\cublas64_10.dll" "%CUDA_PATH%\bin\cublasLt64_10.dll"
 IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\cuda\bin\cudnn64_7.dll"
-IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll"
-IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE
-IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS
-IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE
-IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS
-IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE
-IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS
-IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE
-IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS
-IF %ONNX_DML%==true type dist\README-onnx-dml.txt |more /P > dist\README.txt
-IF %ONNX_DML%==true type dist\install-dml.cmd |more /P > dist\install.cmd
-IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-DML-LICENSE
-IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-DML-ThirdPartyNotices.txt
-IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\lib\onnxruntime.dll"
-IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt
-IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd
-IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-LICENSE
-IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-ThirdPartyNotices.txt
+IF %NAME%==gpu-nvidia-cuda11 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll"
+IF %NAME%==gpu-nvidia-cuda12 (
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_12.dll" "%CUDA_PATH%\bin\cublas64_12.dll" "%CUDA_PATH%\bin\cublasLt64_12.dll"
+  type dist\install-cuda_12_9.cmd |more /P > dist\install.cmd
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip .\dist\install.cmd
+)
+IF %NAME%==cpu-dnnl (
+  copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE
+  copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS
+)
+IF %NAME%==onednn (
+  copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE
+  copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS
+)
+IF %ONNX%==true (
+  copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-LICENSE
+  copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-ThirdPartyNotices.txt
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll"
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-LICENSE
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-ThirdPartyNotices.txt
+  copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip
+  ren lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip %APPVEYOR_BUILD_FOLDER%\build\lc0-dml.exe
+  7z rn lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip lc0-dml.exe lc0.exe
+  type dist\README-onnx-dml.txt |more /P > dist\README.txt
+  type dist\install-dml.cmd |more /P > dist\install.cmd
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip .\dist\README.txt
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip .\dist\install.cmd
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip %APPVEYOR_BUILD_FOLDER%\build\lc0-trt.exe
+  7z rn lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip lc0-trt.exe lc0.exe
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime_providers_shared.dll"
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME_TWO%\lib\onnxruntime_providers_cuda.dll"
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME_TWO%\lib\onnxruntime_providers_tensorrt.dll"
+  type dist\README-onnx-trt.txt |more /P > dist\README.txt
+  type dist\install-trt.cmd |more /P > dist\install.cmd
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip .\dist\README.txt
+  7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip .\dist\install.cmd
+)
 IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat
 IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_opencl.bat
 IF %DX%==true type scripts\check_dx.bat |more /P > dist\check_dx.bat
diff --git a/scripts/compile_proto.py b/scripts/compile_proto.py
index cb7d0450b2..c6a81996d9 100755
--- a/scripts/compile_proto.py
+++ b/scripts/compile_proto.py
@@ -29,60 +29,73 @@
 import os
 import re
 import sys
+from typing import Any
 
 VARINT_TYPES = {
-    'int32': 'std::int32_t',
-    'int64': 'std::int64_t',
-    'uint32': 'std::uint32_t',
-    'uint64': 'std::uint64_t',
-    'sint32': 'std::uint32_t',
-    'sint64': 'std::uint64_t',
-    'bool': 'bool',
+    "int32": "std::int32_t",
+    "int64": "std::int64_t",
+    "uint32": "std::uint32_t",
+    "uint64": "std::uint64_t",
+    "sint32": "std::uint32_t",
+    "sint64": "std::uint64_t",
+    "bool": "bool",
 }
 
 FIXED64_TYPES = {
-    'fixed64': 'std::uint64_t',
-    'sfixed64': 'std::int64_t',
-    'double': 'double',
+    "fixed64": "std::uint64_t",
+    "sfixed64": "std::int64_t",
+    "double": "double",
 }
 FIXED32_TYPES = {
-    'fixed32': 'std::uint32_t',
-    'sfixed32': 'std::int32_t',
-    'float': 'float',
+    "fixed32": "std::uint32_t",
+    "sfixed32": "std::int32_t",
+    "float": "float",
 }
 BYTES_TYPES = {
-    'string': 'std::string_view',
-    'bytes': 'std::string_view',
+    "string": "std::string_view",
+    "bytes": "std::string_view",
 }
-ZIGZAG_TYPES = set(['sint32', 'sint64'])
-FLOAT_TYPES = set(['float', 'double'])
+ZIGZAG_TYPES = set(["sint32", "sint64"])
+FLOAT_TYPES = set(["float", "double"])
 
 TYPES = {**VARINT_TYPES, **FIXED32_TYPES, **FIXED64_TYPES, **BYTES_TYPES}
 
 RESERVED_WORDS = [
-    'syntax',
-    'package',
-    'message',
-    'optional',
-    'required',
-    'repeated',
-    'enum',
+    "enum",
+    "message",
+    "optional",
+    "package",
+    "repeated",
+    "required",
+    "reserved",
+    "syntax",
+    "to",
 ] + list(TYPES.keys())
 
-GRAMMAR = ([(r'%s\b' % x, x)
-            for x in RESERVED_WORDS] + [('\\' + x, x) for x in '=;{}.'] + [
-                (r'/\*.*?\*/', None),  # /* Comment */
-                (r'//.*?$', None),  # // Comment
-                (r'\s+', None),  # Whitespace
-                (r'$', 'EOF'),
-                (r'"((?:[^"\\]|\\.)*)"', 'string'),
-                (r'\d+', 'number'),
-                (r'\w+', 'identifier'),
-            ])
+GRAMMAR = (
+    [(r"%s\b" % x, x) for x in RESERVED_WORDS]
+    + [("\\" + x, x) for x in "=;{}.,[]"]
+    + [
+        (r"/\*.*?\*/", None),  # /* Comment */
+        (r"//.*?$", None),  # // Comment
+        (r"\s+", None),  # Whitespace
+        (r"$", "EOF"),
+        (r'"((?:[^"\\]|\\.)*)"', "string"),
+        (
+            r"[-+]?(?:[0-9]*\.[0-9]+(?:[eE][-+]?[0-9]+)?|[0-9]+[eE][-+]?[0-9]+)",
+            "fnumber",
+        ),
+        (r"[-+]?\d+", "number"),
+        (r"(\w+)", "identifier"),
+    ]
+)
+
+ALLOWED_ATTRIBUTES = {
+    "default",
+}
 
 
 class Lexer:
-
     def __init__(self, text):
         self.text = text
         self.grammar = [(re.compile(x, re.S + re.M), y) for x, y in GRAMMAR]
@@ -90,31 +103,31 @@ def __init__(self, text):
         self.cur_offset = 0
 
     def Pick(self):
-        '''Picks the last token in queue. Doesn't advance the queue.'''
+        """Picks the last token in queue. Doesn't advance the queue."""
         if self.cur_token is None:
             self.cur_token = self.NextToken()
         return self.cur_token
 
     def Consume(self, expected_token, value=None, group=0):
-        '''Gets the token from the queue and advances the queue.
+        """Gets the token from the queue and advances the queue.
 
         If @expected_token if of wrong type, or @value is not equal to regexes
         @group, throws an error.
-        '''
+        """
         token, match = self.Pick()
         if expected_token != token:
-            self.Error('Expected token type [%s]' % expected_token)
+            self.Error(f"Expected token type [{expected_token}], got [{token}]")
         if value is not None and value != match.group(group):
-            self.Error('Expected value [%s]' % value)
+            self.Error("Expected value [%s]" % value)
         self.cur_offset = match.span()[1]
         self.cur_token = None
         return match
 
     def NextToken(self):
-        '''Reads the stream and returns the next token.
+        """Reads the stream and returns the next token.
 
         (which is not whitespace or comment)
-        '''
+        """
         while True:
             token, match = self.NextTokenOrWhitespace()
             if token is None:
@@ -123,39 +136,42 @@ def NextToken(self):
                 return token, match
 
     def NextTokenOrWhitespace(self):
-        '''Reads the stream and returns the next token (possibly whitespace).'''
+        """Reads the stream and returns the next token (possibly whitespace)."""
         for r, token in self.grammar:
             m = r.match(self.text, self.cur_offset)
             if m:
                 return (token, m)
-        self.Error('Unexpected token')
+        token_snippet = self.text[self.cur_offset : self.cur_offset + 10]
+        self.Error(f"Unparseable token [{token_snippet}...]")
 
     def Error(self, text):
-        '''Throws an error with context in the file read.'''
-        line = self.text[:self.cur_offset].count('\n') + 1
-        line_start = self.text.rfind('\n', 0, self.cur_offset) + 1
-        line_end = self.text.find('\n', line_start)
+        """Throws an error with context in the file read."""
+        line = self.text[: self.cur_offset].count("\n") + 1
+        line_start = self.text.rfind("\n", 0, self.cur_offset) + 1
+        line_end = self.text.find("\n", line_start)
         if line_end == -1:
             line_end = len(self.text)
-        sys.stderr.write('%s:\n' % text)
-        sys.stderr.write(self.text[line_start:line_end] + '\n')
-        sys.stderr.write(' ' * (self.cur_offset - line_start) + '^^^\n')
-        raise ValueError("Parse error: %s at line %d column %d." %
-                         (text, line, (self.cur_offset - line_start)))
+        sys.stderr.write("%s:\n" % text)
+        sys.stderr.write(self.text[line_start:line_end] + "\n")
+        sys.stderr.write(" " * (self.cur_offset - line_start) + "^^^\n")
+        raise ValueError(
+            "Parse error: %s at line %d column %d."
+            % (text, line, (self.cur_offset - line_start))
+        )
 
 
 def ReadIdentifierPath(lexer):
-    '''Reads qualified identifier a.b.d into ['a', 'b', 'd'] list'''
+    """Reads qualified identifier a.b.d into ['a', 'b', 'd'] list"""
     path = []
     while True:
-        path.append(lexer.Consume('identifier').group(0))
-        if lexer.Pick()[0] != '.':
+        path.append(lexer.Consume("identifier").group(0))
+        if lexer.Pick()[0] != ".":
             return path
-        lexer.Consume('.')
+        lexer.Consume(".")
 
 
 def LookupType(name, stack):
-    '''Looks up the (possibly qualified) from the innermost scope first.'''
+    """Looks up the (possibly qualified) type name from the innermost scope first."""
     for y in stack:
         for x in y:
             if x.GetName() == name[0]:
@@ -163,7 +179,7 @@ def LookupType(name, stack):
                     return x
                 else:
                     return LookupType(name[1:], [x.GetTypes()])
-    raise ValueError("Cannot find type: %s." % '.'.join(name))
+    raise ValueError("Cannot find type: %s." % ".".join(name))
 
 
 # All *Parser classes have the following semantics:
@@ -172,18 +188,17 @@ def LookupType(name, stack):
 
 
 class ProtoTypeParser:
-
     def __init__(self, lexer, object_stack):
         token, match = lexer.Pick()
         if token in TYPES:
-            self.typetype = 'basic'
+            self.typetype = "basic"
             self.name = token
             lexer.Consume(token)
-        elif token == 'identifier':
+        elif token == "identifier":
             self.name = ReadIdentifierPath(lexer)
-            self.typetype = 'forward'
+            self.typetype = "forward"
         else:
-            lexer.Error('Type expected')
+            lexer.Error("Type expected")
 
     def LookupForwardFieldType(self, object_stack):
         if self.IsForward():
@@ -192,41 +207,43 @@ def LookupForwardFieldType(self, object_stack):
             self.name = [typ.GetFullName()]
 
     def IsZigzag(self):
-        if self.typetype == 'basic':
+        if self.typetype == "basic":
             return self.name in ZIGZAG_TYPES
         return False
 
     def GetCppType(self):
-        if self.typetype == 'basic':
+        if self.typetype == "basic":
             return TYPES[self.name]
         else:
-            return '_'.join(self.name)
+            return "_".join(self.name)
 
     def GetVariableCppType(self):
         if self.IsBytesType():
-            return 'std::string'
+            return "std::string"
         else:
             return self.GetCppType()
 
     def IsEnumType(self):
-        return self.typetype == 'enum'
+        return self.typetype == "enum"
 
     def IsVarintType(self):
-        return self.typetype == 'enum' or (self.typetype == 'basic'
-                                           and self.name in VARINT_TYPES)
+        return self.typetype == "enum" or (
+            self.typetype == "basic" and self.name in VARINT_TYPES
+        )
 
     def IsFixedType(self):
-        return self.typetype == 'basic' and (self.name in FIXED64_TYPES
-                                             or self.name in FIXED32_TYPES)
+        return self.typetype == "basic" and (
+            self.name in FIXED64_TYPES or self.name in FIXED32_TYPES
+        )
 
     def IsBytesType(self):
-        return self.typetype == 'basic' and self.name in BYTES_TYPES
+        return self.typetype == "basic" and self.name in BYTES_TYPES
 
     def IsFloatType(self):
-        return self.typetype == 'basic' and self.name in FLOAT_TYPES
+        return self.typetype == "basic" and self.name in FLOAT_TYPES
 
     def GetWireType(self):
-        if self.typetype == 'basic':
+        if self.typetype == "basic":
             if self.name in VARINT_TYPES:
                 return 0
             if self.name in FIXED64_TYPES:
@@ -235,52 +252,84 @@ def GetWireType(self):
                 return 2
             if self.name in FIXED32_TYPES:
                 return 5
-            raise ValueError('Unknown type %s' % self.name)
-        elif self.typetype == 'enum':
+            raise ValueError("Unknown type %s" % self.name)
+        elif self.typetype == "enum":
             return 0
-        elif self.typetype == 'message':
+        elif self.typetype == "message":
             return 2
         else:
-            raise ValueError('Unknown typetype %s' % self.typetype)
+            raise ValueError("Unknown typetype %s" % self.typetype)
 
     def IsMessage(self):
-        return self.typetype == 'message'
+        return self.typetype == "message"
 
     def IsForward(self):
-        return self.typetype == 'forward'
+        return self.typetype == "forward"
 
     def IsIntegralType(self):
-        if self.typetype == 'basic':
-            if self.name == 'double':
+        if self.typetype == "basic":
+            if self.name == "double":
                 return False
-            if self.name == 'float':
+            if self.name == "float":
                 return False
             if self.name in BYTES_TYPES:
                 return False
             if self.name in TYPES:
                 return True
-            raise ValueError('Unknown type %s' % self.name)
-        elif self.typetype == 'enum':
+            raise ValueError("Unknown type %s" % self.name)
+        elif self.typetype == "enum":
             return True
-        elif self.typetype == 'message':
+        elif self.typetype == "message":
             return False
         else:
-            raise ValueError('Unknown typetype %s' % self.typetype)
+            raise ValueError("Unknown typetype %s" % self.typetype)
 
 
 class ProtoFieldParser:
-
     def __init__(self, lexer, object_stack):
         token, match = lexer.Pick()
-        if token not in ['repeated', 'optional', 'required']:
-            lexer.Error('repeated, optional or required expected')
+        if token not in ["repeated", "optional", "required"]:
+            lexer.Error("repeated, optional or required expected")
         self.category = token
         lexer.Consume(token)
         self.type = ProtoTypeParser(lexer, object_stack)
-        self.name = lexer.Consume('identifier')
-        lexer.Consume('=')
-        self.number = int(lexer.Consume('number').group(0))
-        lexer.Consume(';')
+        self.name = lexer.Consume("identifier")
+        lexer.Consume("=")
+        self.number = int(lexer.Consume("number").group(0))
+        self.attributes = ProtoFieldParser.ParseAttributes(lexer)
+        lexer.Consume(";")
+
+    @staticmethod
+    def ParseAttributes(lexer):
+        attributes = {}
+        token, match = lexer.Pick()
+        if token != "[":
+            return attributes
+        lexer.Consume("[")
+        while True:
+            name = lexer.Consume("identifier").group(0)
+            if name not in ALLOWED_ATTRIBUTES:
+                lexer.Error("Unknown attribute %s" % name)
+            lexer.Consume("=")
+            token, match = lexer.Pick()
+            value = None
+            if token == "string":
+                value = lexer.Consume("string").group(0)
+            elif token == "fnumber":
+                value = float(lexer.Consume("fnumber").group(0))
+            elif token == "number":
+                value = int(lexer.Consume("number").group(0))
+            else:
+                lexer.Error("Expected string or number as default value")
+            attributes[name] = value
+            token, _ = lexer.Pick()
+            if token == "]":
+                lexer.Consume("]")
+                return attributes
+            elif token == ",":
+                lexer.Consume(",")
+            else:
+                lexer.Error("Expected ']' or ','")
 
     def IsType(self):
         return False
@@ -291,96 +340,96 @@ def LookupForwardFieldType(self, object_stack):
     def GetParser(self):
         name = self.name.group(0)
         if self.type.IsMessage():
-            if self.category == 'repeated':
-                return 'add_%s()->MergeFromString(val)' % name
+            if self.category == "repeated":
+                return "add_%s()->MergeFromString(val)" % name
             else:
-                return 'mutable_%s()->MergeFromString(val)' % name
+                return "mutable_%s()->MergeFromString(val)" % name
 
         cpp_type = self.type.GetCppType()
-        val = 'NOT IMPLEMENTED!'
+        val = "NOT IMPLEMENTED!"
         if self.type.IsVarintType():
-            val_val = 'UnZigZag(val)' if self.type.IsZigzag() else 'val'
-            val = 'static_cast<%s>(%s)' % (cpp_type, val_val)
+            val_val = "UnZigZag(val)" if self.type.IsZigzag() else "val"
+            val = "static_cast<%s>(%s)" % (cpp_type, val_val)
         elif self.type.IsFixedType():
             if self.type.IsFloatType():
-                val = 'bit_cast<%s>(val)' % cpp_type
+                val = "bit_cast<%s>(val)" % cpp_type
             else:
-                val = 'static_cast<%s>(val)' % cpp_type
+                val = "static_cast<%s>(val)" % cpp_type
         elif self.type.IsBytesType():
-            val = 'val'
+            val = "val"
 
-        if self.category == 'repeated':
-            return '%s_.emplace_back(%s)' % (name, val)
+        if self.category == "repeated":
+            return "%s_.emplace_back(%s)" % (name, val)
         else:
-            return 'set_%s(%s)' % (name, val)
+            return "set_%s(%s)" % (name, val)
 
     def GenerateCaseClause(self, w):
-        w.Write('case %d: %s; break;' % (self.number, self.GetParser()))
+        w.Write("case %d: %s; break;" % (self.number, self.GetParser()))
 
     def GenerateClear(self, w):
         name = self.name.group(0)
-        if self.category == 'repeated':
-            w.Write('%s_.clear();' % name)
+        if self.category == "repeated":
+            w.Write("%s_.clear();" % name)
         else:
-            w.Write('has_%s_ = false;' % name)
-            w.Write('%s_ = {};' % name)
+            w.Write("has_%s_ = false;" % name)
+            if "default" in self.attributes:
+                w.Write("%s_ = %s;" % (name, self.attributes["default"]))
+            else:
+                w.Write("%s_ = {};" % name)
 
     def GenerateOutput(self, w):
         fname = {
-            0: 'AppendVarInt',
-            1: 'AppendInt64',
-            2: 'AppendString',
-            5: 'AppendInt32'
+            0: "AppendVarInt",
+            1: "AppendInt64",
+            2: "AppendString",
+            5: "AppendInt32",
         }
         tname = {
-            0: 'std::uint64_t',
-            1: 'std::uint64_t',
-            2: 'std::string_view',
-            5: 'std::uint32_t'
+            0: "std::uint64_t",
+            1: "std::uint64_t",
+            2: "std::string_view",
+            5: "std::uint32_t",
         }
         wire_id = self.type.GetWireType()
-        if self.category == 'repeated':
-            prefix = 'for (const auto& x : %s)' % (self.name.group(0) + '_')
-            name = 'x'
+        if self.category == "repeated":
+            prefix = "for (const auto& x : %s)" % (self.name.group(0) + "_")
+            name = "x"
         else:
-            name = self.name.group(0) + '_'
-            prefix = 'if (has_%s)' % (name)
+            name = self.name.group(0) + "_"
+            prefix = "if (has_%s)" % (name)
         if self.type.IsMessage():
-            name += '.OutputAsString()'
+            name += ".OutputAsString()"
         elif self.type.IsFloatType():
-            name = 'bit_cast<%s>(%s)' % (tname[wire_id], name)
+            name = "bit_cast<%s>(%s)" % (tname[wire_id], name)
 
-        w.Write('%s %s(%d, %s, &out);' %
-                (prefix, fname[wire_id], self.number, name))
+        w.Write("%s %s(%d, %s, &out);" % (prefix, fname[wire_id], self.number, name))
 
     def GenerateJsonOutput(self, w):
         name = self.name.group(0)
-        if self.category == 'repeated':
-            prefix = 'if (!%s_.empty())' % name
-            funcname = 'AppendJsonRepeatedField'
+        if self.category == "repeated":
+            prefix = "if (!%s_.empty())" % name
+            funcname = "AppendJsonRepeatedField"
         else:
-            prefix = 'if (has_%s_)' % name
-            funcname = 'AppendJsonField'
+            prefix = "if (has_%s_)" % name
+            funcname = "AppendJsonField"
         if self.type.IsEnumType():
-            value = '%s_Name(%s_)' % (self.type.GetCppType(), name)
+            value = "%s_Name(%s_)" % (self.type.GetCppType(), name)
         else:
             value = name + "_"
-        w.Write('%s %s("%s", %s, &first, &out);' %
-                (prefix, funcname, name, value))
+        w.Write('%s %s("%s", %s, &first, &out);' % (prefix, funcname, name, value))
 
     def GenerateFunctionDeclarations(self, w):
         name = self.name.group(0)
         cpp_type = self.type.GetCppType()
         var_cpp_type = self.type.GetVariableCppType()
-        if self.category == 'repeated':
+        if self.category == "repeated":
             if self.type.IsMessage():
                 w.Write("%s* add_%s();" % (cpp_type, name))
             else:
                 w.Write("void add_%s(%s val);" % (name, cpp_type))
             # Using a vector here breaks API compatibility with the standard
             # protobuf library, but it is more convenient.
-            w.Write("const std::vector<%s>& %s() const;" %
-                    (var_cpp_type, name))
+            w.Write("const std::vector<%s>& %s() const;" % (var_cpp_type, name))
             w.Write("std::vector<%s>* mutable_%s();" % (var_cpp_type, name))
             if self.type.IsMessage():
                 w.Write("const %s& %s(size_t idx) const;" % (cpp_type, name))
@@ -392,8 +441,9 @@ def GenerateFunctionDeclarations(self, w):
             w.Write("bool has_%s() const;" % (name))
             if self.type.IsMessage():
                 w.Write("const %s& %s() const;" % (cpp_type, name))
-                w.Write("%s* mutable_%s();" % (cpp_type, name))
-            else:
+            if self.type.IsMessage() or self.type.IsBytesType():
+                w.Write("%s* mutable_%s();" % (var_cpp_type, name))
+            if not self.type.IsMessage():
                 w.Write("%s %s() const;" % (cpp_type, name))
                 w.Write("void set_%s(%s val);" % (name, cpp_type))
 
@@ -401,53 +451,70 @@ def GenerateFunctionDefinitions(self, w, class_name):
         name = self.name.group(0)
         cpp_type = self.type.GetCppType()
         var_cpp_type = self.type.GetVariableCppType()
-        if self.category == 'repeated':
+        if self.category == "repeated":
             if self.type.IsMessage():
                 w.Write(
-                    "inline %s* %s::add_%s() { return &%s_.emplace_back(); }" %
-                    (cpp_type, class_name, name, name))
+                    "inline %s* %s::add_%s() { return &%s_.emplace_back(); }"
+                    % (cpp_type, class_name, name, name)
+                )
             else:
                 w.Write(
                     "inline void %s::add_%s(%s val) { %s_.emplace_back(val); }"
-                    % (class_name, name, cpp_type, name))
+                    % (class_name, name, cpp_type, name)
+                )
             w.Write(
                 "inline const std::vector<%s>& %s::%s() const { return %s_; }"
-                % (var_cpp_type, class_name, name, name))
+                % (var_cpp_type, class_name, name, name)
+            )
             w.Write(
                 "inline std::vector<%s>* %s::mutable_%s() { return &%s_; }"
-                % (var_cpp_type, class_name, name, name))
+                % (var_cpp_type, class_name, name, name)
+            )
             if self.type.IsMessage():
                 w.Write(
                     "inline const %s& %s::%s(size_t idx) const { return %s_[idx]; }"
-                    % (cpp_type, class_name, name, name))
+                    % (cpp_type, class_name, name, name)
+                )
                 w.Write(
                     "inline %s* %s::mutable_%s(size_t idx) { return &%s_[idx]; }"
-                    % (cpp_type, class_name, name, name))
+                    % (cpp_type, class_name, name, name)
+                )
             else:
                 w.Write(
-                    "inline %s %s::%s(size_t idx) const { return %s_[idx]; }" %
-                    (cpp_type, class_name, name, name))
+                    "inline %s %s::%s(size_t idx) const { return %s_[idx]; }"
+                    % (cpp_type, class_name, name, name)
+                )
             w.Write(
-                "inline size_t %s::%s_size() const { return %s_.size(); }" %
-                (class_name, name, name))
+                "inline size_t %s::%s_size() const { return %s_.size(); }"
+                % (class_name, name, name)
+            )
         else:
-            w.Write("inline bool %s::has_%s() const { return has_%s_; }" %
-                    (class_name, name, name))
+            w.Write(
+                "inline bool %s::has_%s() const { return has_%s_; }"
+                % (class_name, name, name)
+            )
             if self.type.IsMessage():
-                w.Write("inline const %s& %s::%s() const { return %s_; }" %
-                        (cpp_type, class_name, name, name))
-                w.Write("inline %s* %s::mutable_%s() {" %
-                        (cpp_type, class_name, name))
+                w.Write(
+                    "inline const %s& %s::%s() const { return %s_; }"
+                    % (cpp_type, class_name, name, name)
+                )
+            if self.type.IsMessage() or self.type.IsBytesType():
+                w.Write(
+                    "inline %s* %s::mutable_%s() {" % (var_cpp_type, class_name, name)
+                )
                 w.Indent()
-                w.Write('has_%s_ = true;' % (name))
-                w.Write('return &%s_;' % name)
+                w.Write("has_%s_ = true;" % (name))
+                w.Write("return &%s_;" % name)
                 w.Unindent()
                 w.Write("}")
-            else:
-                w.Write("inline %s %s::%s() const { return %s_; }" %
-                        (cpp_type, class_name, name, name))
-                w.Write("inline void %s::set_%s(%s val) {" %
-                        (class_name, name, cpp_type))
+            if not self.type.IsMessage():
+                w.Write(
+                    "inline %s %s::%s() const { return %s_; }"
+                    % (cpp_type, class_name, name, name)
+                )
+                w.Write(
+                    "inline void %s::set_%s(%s val) {" % (class_name, name, cpp_type)
+                )
                 w.Indent()
                 w.Write("has_%s_ = true;" % name)
                 w.Write("%s_ = val;" % name)
@@ -457,41 +524,43 @@ def GenerateFunctionDefinitions(self, w, class_name):
     def GenerateVariable(self, w):
         name = self.name.group(0)
         cpp_type = self.type.GetVariableCppType()
-        if self.category == 'repeated':
+        if self.category == "repeated":
             w.Write("std::vector<%s> %s_;" % (cpp_type, name))
         else:
             w.Write("bool has_%s_{};" % (name))
-            w.Write("%s %s_{};" % (cpp_type, name))
+            if "default" in self.attributes:
+                w.Write("%s %s_{%s};" % (cpp_type, name, self.attributes["default"]))
+            else:
+                w.Write("%s %s_{};" % (cpp_type, name))
         return
 
 
 class ProtoEnumParser:
-
     def __init__(self, lexer, scope):
-        lexer.Consume('enum')
-        self.name = lexer.Consume('identifier').group(0)
+        lexer.Consume("enum")
+        self.name = lexer.Consume("identifier").group(0)
         self.values = []
         self.scope = scope[:]
-        lexer.Consume('{')
+        lexer.Consume("{")
         while True:
             token, match = lexer.Pick()
-            if token == '}':
+            if token == "}":
                 break
-            key = lexer.Consume('identifier').group(0)
-            lexer.Consume('=')
-            value = int(lexer.Consume('number').group(0))
-            lexer.Consume(';')
+            key = lexer.Consume("identifier").group(0)
+            lexer.Consume("=")
+            value = int(lexer.Consume("number").group(0))
+            lexer.Consume(";")
             self.values.append((key, value))
-        lexer.Consume('}')
+        lexer.Consume("}")
 
     def GetName(self):
         return self.name
 
     def GetFullName(self):
-        return '_'.join([x.GetName() for x in self.scope] + [self.name])
+        return "_".join([x.GetName() for x in self.scope] + [self.name])
 
     def GetType(self):
-        return 'enum'
+        return "enum"
 
     def IsType(self):
         return True
@@ -510,81 +579,112 @@ def GenerateFunctionDefinitions(self, w):
 
     def GenerateEnumDefinitions(self, w):
         # Protobuf enum is mapped directly to C++ enum.
-        w.Write('enum %s : int {' % self.GetFullName())
+        w.Write("enum %s : int {" % self.GetFullName())
         w.Indent()
         for key, value in self.values:
-            w.Write('%s_%s = %d,' % (self.GetFullName(), key, value))
+            w.Write("%s_%s = %d," % (self.GetFullName(), key, value))
         w.Unindent()
-        w.Write('};')
-        w.Write('inline std::string %s_Name(%s val) {' %
-                (self.GetFullName(), self.GetFullName()))
+        w.Write("};")
+        w.Write(
+            "inline std::string %s_Name(%s val) {"
+            % (self.GetFullName(), self.GetFullName())
+        )
         w.Indent()
-        w.Write('switch (val) {')
+        w.Write("switch (val) {")
         w.Indent()
         for key, _ in self.values:
-            w.Write('case %s_%s:' % (self.GetFullName(), key))
+            w.Write("case %s_%s:" % (self.GetFullName(), key))
             w.Write('  return "%s";' % key)
         w.Unindent()
-        w.Write('};')
+        w.Write("};")
         w.Write('return "%s(" + std::to_string(val) + ")";' % self.name)
         w.Unindent()
-        w.Write('}')
+        w.Write("}")
 
     def GenerateUsingDirectives(self, w):
-        w.Write('using %s = %s;' % (self.name, self.GetFullName()))
+        w.Write("using %s = %s;" % (self.name, self.GetFullName()))
         for key, _ in self.values:
-            w.Write('static constexpr %s %s =' % (self.name, key))
-            w.Write('    %s_%s;' % (self.GetFullName(), key))
-        w.Write('static constexpr std::array<%s,%d> %s_AllValues = {' %
-                (self.name, len(self.values), self.name))
+            w.Write("static constexpr %s %s =" % (self.name, key))
+            w.Write("    %s_%s;" % (self.GetFullName(), key))
+        w.Write(
+            "static constexpr std::array<%s,%d> %s_AllValues = {"
+            % (self.name, len(self.values), self.name)
+        )
         w.Indent()
         for key, _ in self.values:
-            w.Write('%s,' % key)
+            w.Write("%s," % key)
         w.Unindent()
-        w.Write('};')
+        w.Write("};")
         # Static function to convert an enum value to its name.
-        w.Write('static std::string %s_Name(%s val) {' %
-                (self.name, self.name))
+        w.Write("static std::string %s_Name(%s val) {" % (self.name, self.name))
         w.Indent()
-        w.Write('return %s_Name(val);' % (self.GetFullName()))
+        w.Write("return %s_Name(val);" % (self.GetFullName()))
         w.Unindent()
-        w.Write('}')
+        w.Write("}")
 
 
-class ProtoMessageParser:
+def ParseReservedFields(lexer):
+    """Parses a `reserved ...;` statement into a set; `N to M` becomes a range."""
+    lexer.Consume("reserved")
+    reserved = set()
+    done = False
+    while not done:
+        kind, _ = lexer.Pick()
+        if kind == "number":
+            first = int(lexer.Consume("number").group(0))
+            if lexer.Pick()[0] != "to":
+                reserved.add(first)
+            else:
+                lexer.Consume("to")
+                last = int(lexer.Consume("number").group(0))
+                reserved.add(range(first, last + 1))  # "to" bound is inclusive.
+        elif kind in ("identifier", "string"):
+            reserved.add(lexer.Consume(kind).group(1))
+        else:
+            lexer.Error("Expected number or identifier")
+        # ";" terminates the statement; otherwise a "," separator is required.
+        done = lexer.Pick()[0] == ";"
+        lexer.Consume(";" if done else ",")
+    return reserved
+
 
+class ProtoMessageParser:
     def __init__(self, lexer, type_stack, scope):
         type_stack[0].append(self)
+        self.reserved = set()
         self.types = []
         self.fields = []
         self.scope = scope[:]
-        lexer.Consume('message')
-        self.name = lexer.Consume('identifier').group(0)
-        lexer.Consume('{')
+        lexer.Consume("message")
+        self.name = lexer.Consume("identifier").group(0)
+        lexer.Consume("{")
         while True:
             token, match = lexer.Pick()
-            if token == '}':
+            if token == "}":
                 break
-            elif token == 'message':
-                ProtoMessageParser(lexer, [self.types, *type_stack],
-                                   self.scope + [self])
-            elif token == 'enum':
+            elif token == "message":
+                ProtoMessageParser(
+                    lexer, [self.types, *type_stack], self.scope + [self]
+                )
+            elif token == "enum":
                 self.types.append(ProtoEnumParser(lexer, self.scope + [self]))
-            elif token in ['repeated', 'optional', 'required']:
-                self.fields.append(
-                    ProtoFieldParser(lexer, [self.types, *type_stack]))
+            elif token in ["repeated", "optional", "required"]:
+                self.fields.append(ProtoFieldParser(lexer, [self.types, *type_stack]))
+            elif token == "reserved":
+                self.reserved.update(ParseReservedFields(lexer))
             else:
-                lexer.Error('Expected field or type')
-        lexer.Consume('}')
+                lexer.Error("Expected field or type")
+        lexer.Consume("}")
+        self.CheckReserved()
 
     def GetName(self):
         return self.name
 
     def GetFullName(self):
-        return '_'.join([x.GetName() for x in self.scope] + [self.name])
+        return "_".join([x.GetName() for x in self.scope] + [self.name])
 
     def GetType(self):
-        return 'message'
+        return "message"
 
     def IsType(self):
         return True
@@ -598,6 +698,20 @@ def GetFieldsGruppedByWireType(self):
             type_to_fields.setdefault(x.type.GetWireType(), []).append(x)
         return type_to_fields
 
+    def CheckReserved(self):
+        """Raises ValueError if any field uses a reserved number, range or name."""
+        numbers = [x.number for x in self.fields]
+        for r in self.reserved:
+            if isinstance(r, range) and any(n in r for n in numbers):
+                raise ValueError(
+                    f"Field range [{r.start} to {r.stop - 1}] is reserved."
+                )
+            if isinstance(r, int) and r in numbers:
+                raise ValueError(f"Field number [{r}] is reserved.")
+            if not isinstance(r, (int, range)):
+                if any(x.name.group(0) == r for x in self.fields):
+                    raise ValueError(f"Field name [{r}] is reserved.")
+
     def ResolveForwardDeclarations(self, type_stack):
         type_stack.append(self.types)
         for x in self.types:
@@ -607,41 +721,44 @@ def ResolveForwardDeclarations(self, type_stack):
         type_stack.pop()
 
     def WriteFieldParserDeclaration(self, w, wire_id, fields):
-        fname = {0: 'SetVarInt', 1: 'SetInt64', 2: 'SetString', 5: 'SetInt32'}
+        fname = {0: "SetVarInt", 1: "SetInt64", 2: "SetString", 5: "SetInt32"}
         tname = {
-            0: 'std::uint64_t',
-            1: 'std::uint64_t',
-            2: 'std::string_view',
-            5: 'std::uint32_t'
+            0: "std::uint64_t",
+            1: "std::uint64_t",
+            2: "std::string_view",
+            5: "std::uint32_t",
         }
-        w.Write('void %s(int field_id, %s val) final;' %
-                (fname[wire_id], tname[wire_id]))
+        w.Write(
+            "void %s(int field_id, %s val) final;" % (fname[wire_id], tname[wire_id])
+        )
 
     def WriteFieldParserDefinition(self, w, wire_id, fields):
-        fname = {0: 'SetVarInt', 1: 'SetInt64', 2: 'SetString', 5: 'SetInt32'}
+        fname = {0: "SetVarInt", 1: "SetInt64", 2: "SetString", 5: "SetInt32"}
         tname = {
-            0: 'std::uint64_t',
-            1: 'std::uint64_t',
-            2: 'std::string_view',
-            5: 'std::uint32_t'
+            0: "std::uint64_t",
+            1: "std::uint64_t",
+            2: "std::string_view",
+            5: "std::uint32_t",
         }
-        w.Write('inline void %s::%s(int field_id, %s val) {' %
-                (self.GetFullName(), fname[wire_id], tname[wire_id]))
+        w.Write(
+            "inline void %s::%s(int field_id, %s val) {"
+            % (self.GetFullName(), fname[wire_id], tname[wire_id])
+        )
         w.Indent()
-        w.Write('switch (field_id) {')
+        w.Write("switch (field_id) {")
         w.Indent()
         for field in fields:
             field.GenerateCaseClause(w)
         w.Unindent()
-        w.Write('}')
+        w.Write("}")
         w.Unindent()
-        w.Write('}')
+        w.Write("}")
 
     def GenerateUsingDirectives(self, w):
-        w.Write('using %s = %s;' % (self.name, self.GetFullName()))
+        w.Write("using %s = %s;" % (self.name, self.GetFullName()))
 
     def GenerateMessageDeclarations(self, w):
-        w.Write(f'class %s;' % self.GetFullName())
+        w.Write(f"class %s;" % self.GetFullName())
         for x in self.types:
             x.GenerateMessageDeclarations(w)
 
@@ -652,42 +769,41 @@ def GenerateEnumDefinitions(self, w):
     def GenerateMessageDefinitions(self, w):
         # Writing nested messages.
         for x in self.types:
-            if x.GetType() == 'message':
+            if x.GetType() == "message":
                 x.GenerateMessageDefinitions(w)
         # Protobuf message is a C++ class.
-        w.Write('class %s final : public lczero::ProtoMessage {' %
-                self.GetFullName())
-        w.Write(' public:')
+        w.Write("class %s final : public lczero::ProtoMessage {" % self.GetFullName())
+        w.Write(" public:")
         w.Indent()
         # Writing using directives.
         for x in self.types:
             x.GenerateUsingDirectives(w)
         # Writing function declarations.
         for x in self.fields:
-            w.Write('')
+            w.Write("")
             x.GenerateFunctionDeclarations(w)
-        w.Write('')
-        w.Write('std::string OutputAsString() const final;')
-        w.Write('std::string OutputAsJson() const final;')
-        w.Write('void Clear() final;')
+        w.Write("")
+        w.Write("std::string OutputAsString() const final;")
+        w.Write("std::string OutputAsJson() const final;")
+        w.Write("void Clear() final;")
 
         w.Unindent()
-        w.Write('')
-        w.Write(' private:')
+        w.Write("")
+        w.Write(" private:")
         w.Indent()
         for k, v in self.GetFieldsGruppedByWireType().items():
             self.WriteFieldParserDeclaration(w, k, v)
-        w.Write('')
+        w.Write("")
         for x in self.fields:
             x.GenerateVariable(w)
         w.Unindent()
-        w.Write('};')
-        w.Write('')
+        w.Write("};")
+        w.Write("")
 
     def GenerateFunctionDefinitions(self, w):
         # Writing nested messages.
         for x in self.types:
-            if x.GetType() == 'message':
+            if x.GetType() == "message":
                 x.GenerateFunctionDefinitions(w)
         self.GenerateOutputAsStringFunc(w)
         self.GenerateOutputAsJsonFunc(w)
@@ -696,37 +812,35 @@ def GenerateFunctionDefinitions(self, w):
         self.GenerateFieldAccessorFuncs(w)
 
     def GenerateOutputAsStringFunc(self, w):
-        w.Write('inline std::string %s::OutputAsString() const {' %
-                self.GetFullName())
+        w.Write("inline std::string %s::OutputAsString() const {" % self.GetFullName())
         w.Indent()
-        w.Write('std::string out;')
+        w.Write("std::string out;")
         for x in sorted(self.fields, key=lambda x: x.number):
             x.GenerateOutput(w)
-        w.Write('return out;')
+        w.Write("return out;")
         w.Unindent()
-        w.Write('}')
+        w.Write("}")
 
     def GenerateOutputAsJsonFunc(self, w):
-        w.Write('inline std::string %s::OutputAsJson() const {' %
-                self.GetFullName())
+        w.Write("inline std::string %s::OutputAsJson() const {" % self.GetFullName())
         w.Indent()
         if self.fields:
-            w.Write('bool first = true;')
+            w.Write("bool first = true;")
         w.Write('std::string out = "{";')
         for x in self.fields:
             x.GenerateJsonOutput(w)
         w.Write('out += "}";')
-        w.Write('return out;')
+        w.Write("return out;")
         w.Unindent()
-        w.Write('}')
+        w.Write("}")
 
     def GenerateClearFunc(self, w):
-        w.Write('inline void %s::Clear() {' % self.GetFullName())
+        w.Write("inline void %s::Clear() {" % self.GetFullName())
         w.Indent()
         for x in self.fields:
             x.GenerateClear(w)
         w.Unindent()
-        w.Write('}')
+        w.Write("}")
 
     def GenerateParserFuncs(self, w):
         for k, v in self.GetFieldsGruppedByWireType().items():
@@ -738,38 +852,38 @@ def GenerateFieldAccessorFuncs(self, w):
 
 
 class ProtoFileParser:
-    '''Root grammar of .proto file'''
+    """Root grammar of .proto file"""
 
     def __init__(self, lexer):
         self.package = None
         self.types = []
         while True:
             token, match = lexer.Pick()
-            if token == 'EOF':
+            if token == "EOF":
                 return
-            elif token == 'syntax':
+            elif token == "syntax":
                 self.ParseSyntax(lexer)
-            elif token == 'package':
+            elif token == "package":
                 self.ParsePackage(lexer)
-            elif token == 'message':
+            elif token == "message":
                 self.ParseMessage(lexer)
-            elif token == 'enum':
+            elif token == "enum":
                 self.ParseEnum(lexer)
             else:
-                lexer.Error('Expected message or something similar')
+                lexer.Error("Expected message or something similar")
 
     def ParseSyntax(self, lexer):
-        lexer.Consume('syntax')
-        lexer.Consume('=')
-        lexer.Consume('string', 'proto2', 1)
-        lexer.Consume(';')
+        lexer.Consume("syntax")
+        lexer.Consume("=")
+        lexer.Consume("string", "proto2", 1)
+        lexer.Consume(";")
 
     def ParsePackage(self, lexer):
-        lexer.Consume('package')
+        lexer.Consume("package")
         if self.package is not None:
-            lexer.Error('Package was already defined')
+            lexer.Error("Package was already defined")
         self.package = ReadIdentifierPath(lexer)
-        lexer.Consume(';')
+        lexer.Consume(";")
 
     def ParseMessage(self, lexer):
         ProtoMessageParser(lexer, [self.types], [])
@@ -778,27 +892,27 @@ def ParseEnum(self, lexer):
         self.types.append(ProtoEnumParser(lexer, []))
 
     def Generate(self, w):
-        w.Write('// This file is AUTOGENERATED, do not edit.')
-        w.Write('#pragma once')
+        w.Write("// This file is AUTOGENERATED, do not edit.")
+        w.Write("#pragma once")
         w.Write('#include "utils/protomessage.h"')
         for x in self.package:
-            w.Write('namespace %s {' % x)
-        w.Write('')
-        w.Write('// Forward declarations.')
+            w.Write("namespace %s {" % x)
+        w.Write("")
+        w.Write("// Forward declarations.")
         for object in self.types:
             object.GenerateMessageDeclarations(w)
         for object in self.types:
             object.GenerateEnumDefinitions(w)
-        w.Write('')
-        w.Write('// Class declarations.')
+        w.Write("")
+        w.Write("// Class declarations.")
         for object in self.types:
             object.GenerateMessageDefinitions(w)
-        w.Write('')
-        w.Write('// Function definitions.')
+        w.Write("")
+        w.Write("// Function definitions.")
         for object in self.types:
             object.GenerateFunctionDefinitions(w)
         for x in reversed(self.package):
-            w.Write('}  // namespace %s' % x)
+            w.Write("}  // namespace %s" % x)
 
     def ResolveForwardDeclarations(self):
         type_stack = [self.types]
@@ -807,7 +921,7 @@ def ResolveForwardDeclarations(self):
 
 
 class Writer:
-    '''A helper class for writing file line by line with indent.'''
+    """A helper class for writing file line by line with indent."""
 
     def __init__(self, fo):
         self.fo = fo
@@ -821,26 +935,26 @@ def Unindent(self):
 
     def Write(self, text):
         if text:
-            self.fo.write(' ' * self.indent + text + '\n')
+            self.fo.write(" " * self.indent + text + "\n")
         else:
-            self.fo.write('\n')
+            self.fo.write("\n")
 
 
 if __name__ == "__main__":
     # Have the same flags as protoc has.
     parser = argparse.ArgumentParser(description="Compile protobuf files.")
-    parser.add_argument('input', type=str)
-    parser.add_argument('--proto_path', type=str)
-    parser.add_argument('--cpp_out', type=str)
+    parser.add_argument("input", type=str)
+    parser.add_argument("--proto_path", type=str)
+    parser.add_argument("--cpp_out", type=str)
     args = parser.parse_args()
 
     rel_path = os.path.relpath(args.input, args.proto_path)
-    dest_name = os.path.splitext(rel_path)[0] + '.pb.h'
+    dest_name = os.path.splitext(rel_path)[0] + ".pb.h"
     dest_path = os.path.join(args.cpp_out, dest_name)
     dest_dir = os.path.dirname(dest_path)
     os.makedirs(dest_dir, exist_ok=True)
 
-    with open(args.input, 'r') as input, open(dest_path, 'w') as output:
+    with open(args.input, "r") as input, open(dest_path, "w") as output:
         proto_file = ProtoFileParser(Lexer(input.read()))
         proto_file.ResolveForwardDeclarations()
         writer = Writer(output)
diff --git a/scripts/sycl_build_hack.py b/scripts/sycl_build_hack.py
new file mode 100644
index 0000000000..e7e3478875
--- /dev/null
+++ b/scripts/sycl_build_hack.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""Patches Meson's build.ninja in place: icx as linker, gcc-style depfiles."""
+
+import os
+import sys
+
+# Replace xilink with icx as the linker; substitutions are applied in this order.
+LINK_REPLACEMENTS = (('xilink.exe', 'icx'), ('/MACHINE:x64', ''), ('/OUT:', '-o '),
+                     ('/SUBSYSTEM:CONSOLE', ''), ('/OPT:REF', ''), ('/PDB:', '/Fd'))
+
+build_root = os.getenv('MESON_BUILD_ROOT')
+if not build_root:
+  sys.exit('MESON_BUILD_ROOT is not set; cannot locate build.ninja.')
+ninja_path = os.path.join(build_root, 'build.ninja')
+with open(ninja_path, 'r') as file:
+  lines = file.readlines()
+updated = []
+dep_flag = False  # True while the latest rule/build line mentions cpp_COMPILER.
+link_flag = False  # Sticky: set by the first xilink.exe line, never cleared.
+for line in lines:
+  link_flag = link_flag or 'xilink.exe' in line
+  if link_flag:
+    for old, new in LINK_REPLACEMENTS:
+      line = line.replace(old, new)
+  # Replace msvc compatible dependencies with gcc ones as icx output with /showincludes includes
+  # temporary header files causing full project rebuilds.
+  if line.startswith(('rule', 'build')):
+    dep_flag = 'cpp_COMPILER' in line
+  if dep_flag:
+    line = line.replace('deps = msvc', 'deps = gcc\n depfile = $out.d')
+    line = line.replace('/showIncludes', '/QMD')
+    if 'icx' in line:
+      line = line.replace('/Fo$out', '/Fo$out /QMF$out.d')
+  updated.append(line)
+
+with open(ninja_path, 'w') as file:
+  file.writelines(updated)
diff --git a/src/benchmark/backendbench.cc b/src/benchmark/backendbench.cc
deleted file mode 100644
index 6792f9b778..0000000000
--- a/src/benchmark/backendbench.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2020-2021 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-
-#include "benchmark/backendbench.h"
-
-#include "chess/board.h"
-#include "mcts/node.h"
-#include "neural/factory.h"
-#include "utils/optionsparser.h"
-
-namespace lczero {
-namespace {
-const int kDefaultThreads = 1;
-
-const OptionId kThreadsOptionId{"threads", "Threads",
-                                "Number of (CPU) worker threads to use.", 't'};
-const OptionId kBatchesId{"batches", "",
-                          "Number of batches to run as a benchmark."};
-const OptionId kStartBatchSizeId{"start-batch-size", "",
-                                 "Start benchmark from this batch size."};
-const OptionId kMaxBatchSizeId{"max-batch-size", "",
-                               "Maximum batch size to benchmark."};
-const OptionId kBatchStepId{"batch-step", "",
-                            "Step of batch size in benchmark."};
-const OptionId kFenId{"fen", "", "Benchmark initial position FEN."};
-
-const OptionId kClippyId{"clippy", "", "Enable helpful assistant."};
-
-void Clippy(std::string title,
-            std::string msg3,  std::string best3, std::string msg2,
-            std::string best2, std::string msg,   std::string best) {
-  std::cout << "  __" << std::endl;
-  std::cout << " /  \\" << std::endl;
-  std::cout << " |  |    " << std::string(title.length()+2, '_') << std::endl;
-  std::cout << " +  +   | " << std::string(title.length()+1, ' ')
-            << "|" << std::endl;
-  std::cout << "(@)(@) _| "
-            << title << " |"
-            << std::endl;
-  std::cout << " |  |  \\  " << std::string(6, ' ') << msg3
-            << std::string(4 - best3.length(), ' ') << best3
-            << std::string(title.length()-33, ' ') << "|" << std::endl;
-  std::cout << " || |/  | " << std::string(6, ' ') << msg2
-            << std::string(4 - best2.length(), ' ') << best2
-            << std::string(title.length()-33, ' ') << "|" << std::endl;
-  std::cout << " || ||  | " << std::string(6, ' ') << msg
-            << std::string(4 - best.length(), ' ') << best
-            << std::string(title.length()-33, ' ') << "|" << std::endl;
-  std::cout << " |\\_/|  |" << std::string(title.length()+2, '_') << "|"
-            << std::endl;
-  std::cout << " \\___/" << std::endl;
-}
-}  // namespace
-
-void BackendBenchmark::Run() {
-  OptionsParser options;
-  NetworkFactory::PopulateOptions(&options);
-  options.Add<IntOption>(kThreadsOptionId, 1, 128) = kDefaultThreads;
-
-  options.Add<IntOption>(kBatchesId, 1, 999999999) = 100;
-  options.Add<IntOption>(kStartBatchSizeId, 1, 1024) = 1;
-  options.Add<IntOption>(kMaxBatchSizeId, 1, 1024) = 256;
-  options.Add<IntOption>(kBatchStepId, 1, 256) = 1;
-  options.Add<StringOption>(kFenId) = ChessBoard::kStartposFen;
-  options.Add<BoolOption>(kClippyId) = false;
-
-  if (!options.ProcessAllFlags()) return;
-
-  try {
-    auto option_dict = options.GetOptionsDict();
-
-    auto network = NetworkFactory::LoadNetwork(option_dict);
-
-    NodeTree tree;
-    tree.ResetToPosition(option_dict.Get<std::string>(kFenId), {});
-
-    // Do any backend initialization outside the loop.
-    auto warmup = network->NewComputation();
-    warmup->AddInput(EncodePositionForNN(
-        network->GetCapabilities().input_format, tree.GetPositionHistory(), 8,
-        FillEmptyHistory::ALWAYS, nullptr));
-    warmup->ComputeBlocking();
-
-    const int batches = option_dict.Get<int>(kBatchesId);
-
-    int best = 1; int best2 = 1; int best3 = 1;
-    float best_nps = 0.0f; float best_nps2 = 0.0f; float best_nps3 = 0.0f;
-    std::optional<std::chrono::time_point<std::chrono::steady_clock>> pending;
-
-    for (int i = option_dict.Get<int>(kStartBatchSizeId);
-         i <= option_dict.Get<int>(kMaxBatchSizeId);
-         i += option_dict.Get<int>(kBatchStepId)) {
-      const auto start = std::chrono::steady_clock::now();
-      // TODO: support threads not equal to 1 to be able to more sensibly test
-      // multiplexing backend.
-      for (int j = 0; j < batches; j++) {
-        // Put i copies of tree root node into computation and compute.
-        auto computation = network->NewComputation();
-        for (int k = 0; k < i; k++) {
-          computation->AddInput(EncodePositionForNN(
-              network->GetCapabilities().input_format,
-              tree.GetPositionHistory(), 8, FillEmptyHistory::ALWAYS, nullptr));
-        }
-        computation->ComputeBlocking();
-      }
-
-      const auto end = std::chrono::steady_clock::now();
-      std::chrono::duration<double> time = end - start;
-      const auto nps = i * batches / time.count();
-      std::cout << "Benchmark batch size " << i
-                << " with inference average time "
-                << time.count() / batches * 1000 << "ms - throughput " << nps
-                << " nps." << std::endl;
-
-      if (option_dict.Get<bool>(kClippyId)) {
-        float nps_ingame  = std::pow((nps + best_nps)  / 2, 1.085);
-        float nps_ingame2 = std::pow((nps + best_nps2) / 2, 1.085);
-        float nps_ingame3 = std::pow((nps + best_nps3) / 2, 1.085);
-        float threshold  = 0.16947 * exp(-4.1695e-6 * nps_ingame  * 180) + 0.02;
-        float threshold2 = 0.16947 * exp(-4.1695e-6 * nps_ingame2 *  15) + 0.02;
-        float threshold3 = 0.16947 * exp(-4.1695e-6 * nps_ingame3 *   1) + 0.02;
-
-        if (nps > best_nps &&
-            threshold * (i - best) * best_nps < (nps - best_nps) * best) {
-          best_nps = nps;
-          best = i;
-          if (threshold2 * (i - best2) * best_nps2 <
-              (nps - best_nps2) * best2) {
-            best_nps2 = nps;
-            best2 = i;
-            if (threshold3 * (i - best3) * best_nps3 <
-                (nps - best_nps3) * best3) {
-              best_nps3 = nps;
-              best3 = i;
-            }
-          }
-          if (!pending) {
-            pending = std::chrono::steady_clock::now();
-          }
-        }
-        if (pending) {
-          time = std::chrono::steady_clock::now() - *pending;
-          if (time.count() > 10) {
-            Clippy(
-                "Recommended minibatch-size for this net (so far):",
-                "1s/move   (Bullet):     ", std::to_string(best3),
-                "15s/move  (Rapid):      ", std::to_string(best2),
-                "3min/move (Tournament): ", std::to_string(best));
-            pending.reset();
-          }
-        }
-      }
-    }
-    if (option_dict.Get<bool>(kClippyId)) {
-        Clippy(
-            "Recommended minibatch-size for this net:",
-            "1s/move   (Bullet):     ", std::to_string(best3),
-            "15s/move  (Rapid):      ", std::to_string(best2),
-            "3min/move (Tournament): ", std::to_string(best));
-    }
-  } catch (Exception& ex) {
-    std::cerr << ex.what() << std::endl;
-  }
-}
-}  // namespace lczero
diff --git a/src/chess/bitboard.cc b/src/chess/bitboard.cc
deleted file mode 100644
index 3402775eec..0000000000
--- a/src/chess/bitboard.cc
+++ /dev/null
@@ -1,365 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2018 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-
-#include "chess/bitboard.h"
-
-#include "utils/exception.h"
-
-namespace lczero {
-
-namespace {
-
-const Move kIdxToMove[] = {
-    "a1b1",  "a1c1",  "a1d1",  "a1e1",  "a1f1",  "a1g1",  "a1h1",  "a1a2",
-    "a1b2",  "a1c2",  "a1a3",  "a1b3",  "a1c3",  "a1a4",  "a1d4",  "a1a5",
-    "a1e5",  "a1a6",  "a1f6",  "a1a7",  "a1g7",  "a1a8",  "a1h8",  "b1a1",
-    "b1c1",  "b1d1",  "b1e1",  "b1f1",  "b1g1",  "b1h1",  "b1a2",  "b1b2",
-    "b1c2",  "b1d2",  "b1a3",  "b1b3",  "b1c3",  "b1d3",  "b1b4",  "b1e4",
-    "b1b5",  "b1f5",  "b1b6",  "b1g6",  "b1b7",  "b1h7",  "b1b8",  "c1a1",
-    "c1b1",  "c1d1",  "c1e1",  "c1f1",  "c1g1",  "c1h1",  "c1a2",  "c1b2",
-    "c1c2",  "c1d2",  "c1e2",  "c1a3",  "c1b3",  "c1c3",  "c1d3",  "c1e3",
-    "c1c4",  "c1f4",  "c1c5",  "c1g5",  "c1c6",  "c1h6",  "c1c7",  "c1c8",
-    "d1a1",  "d1b1",  "d1c1",  "d1e1",  "d1f1",  "d1g1",  "d1h1",  "d1b2",
-    "d1c2",  "d1d2",  "d1e2",  "d1f2",  "d1b3",  "d1c3",  "d1d3",  "d1e3",
-    "d1f3",  "d1a4",  "d1d4",  "d1g4",  "d1d5",  "d1h5",  "d1d6",  "d1d7",
-    "d1d8",  "e1a1",  "e1b1",  "e1c1",  "e1d1",  "e1f1",  "e1g1",  "e1h1",
-    "e1c2",  "e1d2",  "e1e2",  "e1f2",  "e1g2",  "e1c3",  "e1d3",  "e1e3",
-    "e1f3",  "e1g3",  "e1b4",  "e1e4",  "e1h4",  "e1a5",  "e1e5",  "e1e6",
-    "e1e7",  "e1e8",  "f1a1",  "f1b1",  "f1c1",  "f1d1",  "f1e1",  "f1g1",
-    "f1h1",  "f1d2",  "f1e2",  "f1f2",  "f1g2",  "f1h2",  "f1d3",  "f1e3",
-    "f1f3",  "f1g3",  "f1h3",  "f1c4",  "f1f4",  "f1b5",  "f1f5",  "f1a6",
-    "f1f6",  "f1f7",  "f1f8",  "g1a1",  "g1b1",  "g1c1",  "g1d1",  "g1e1",
-    "g1f1",  "g1h1",  "g1e2",  "g1f2",  "g1g2",  "g1h2",  "g1e3",  "g1f3",
-    "g1g3",  "g1h3",  "g1d4",  "g1g4",  "g1c5",  "g1g5",  "g1b6",  "g1g6",
-    "g1a7",  "g1g7",  "g1g8",  "h1a1",  "h1b1",  "h1c1",  "h1d1",  "h1e1",
-    "h1f1",  "h1g1",  "h1f2",  "h1g2",  "h1h2",  "h1f3",  "h1g3",  "h1h3",
-    "h1e4",  "h1h4",  "h1d5",  "h1h5",  "h1c6",  "h1h6",  "h1b7",  "h1h7",
-    "h1a8",  "h1h8",  "a2a1",  "a2b1",  "a2c1",  "a2b2",  "a2c2",  "a2d2",
-    "a2e2",  "a2f2",  "a2g2",  "a2h2",  "a2a3",  "a2b3",  "a2c3",  "a2a4",
-    "a2b4",  "a2c4",  "a2a5",  "a2d5",  "a2a6",  "a2e6",  "a2a7",  "a2f7",
-    "a2a8",  "a2g8",  "b2a1",  "b2b1",  "b2c1",  "b2d1",  "b2a2",  "b2c2",
-    "b2d2",  "b2e2",  "b2f2",  "b2g2",  "b2h2",  "b2a3",  "b2b3",  "b2c3",
-    "b2d3",  "b2a4",  "b2b4",  "b2c4",  "b2d4",  "b2b5",  "b2e5",  "b2b6",
-    "b2f6",  "b2b7",  "b2g7",  "b2b8",  "b2h8",  "c2a1",  "c2b1",  "c2c1",
-    "c2d1",  "c2e1",  "c2a2",  "c2b2",  "c2d2",  "c2e2",  "c2f2",  "c2g2",
-    "c2h2",  "c2a3",  "c2b3",  "c2c3",  "c2d3",  "c2e3",  "c2a4",  "c2b4",
-    "c2c4",  "c2d4",  "c2e4",  "c2c5",  "c2f5",  "c2c6",  "c2g6",  "c2c7",
-    "c2h7",  "c2c8",  "d2b1",  "d2c1",  "d2d1",  "d2e1",  "d2f1",  "d2a2",
-    "d2b2",  "d2c2",  "d2e2",  "d2f2",  "d2g2",  "d2h2",  "d2b3",  "d2c3",
-    "d2d3",  "d2e3",  "d2f3",  "d2b4",  "d2c4",  "d2d4",  "d2e4",  "d2f4",
-    "d2a5",  "d2d5",  "d2g5",  "d2d6",  "d2h6",  "d2d7",  "d2d8",  "e2c1",
-    "e2d1",  "e2e1",  "e2f1",  "e2g1",  "e2a2",  "e2b2",  "e2c2",  "e2d2",
-    "e2f2",  "e2g2",  "e2h2",  "e2c3",  "e2d3",  "e2e3",  "e2f3",  "e2g3",
-    "e2c4",  "e2d4",  "e2e4",  "e2f4",  "e2g4",  "e2b5",  "e2e5",  "e2h5",
-    "e2a6",  "e2e6",  "e2e7",  "e2e8",  "f2d1",  "f2e1",  "f2f1",  "f2g1",
-    "f2h1",  "f2a2",  "f2b2",  "f2c2",  "f2d2",  "f2e2",  "f2g2",  "f2h2",
-    "f2d3",  "f2e3",  "f2f3",  "f2g3",  "f2h3",  "f2d4",  "f2e4",  "f2f4",
-    "f2g4",  "f2h4",  "f2c5",  "f2f5",  "f2b6",  "f2f6",  "f2a7",  "f2f7",
-    "f2f8",  "g2e1",  "g2f1",  "g2g1",  "g2h1",  "g2a2",  "g2b2",  "g2c2",
-    "g2d2",  "g2e2",  "g2f2",  "g2h2",  "g2e3",  "g2f3",  "g2g3",  "g2h3",
-    "g2e4",  "g2f4",  "g2g4",  "g2h4",  "g2d5",  "g2g5",  "g2c6",  "g2g6",
-    "g2b7",  "g2g7",  "g2a8",  "g2g8",  "h2f1",  "h2g1",  "h2h1",  "h2a2",
-    "h2b2",  "h2c2",  "h2d2",  "h2e2",  "h2f2",  "h2g2",  "h2f3",  "h2g3",
-    "h2h3",  "h2f4",  "h2g4",  "h2h4",  "h2e5",  "h2h5",  "h2d6",  "h2h6",
-    "h2c7",  "h2h7",  "h2b8",  "h2h8",  "a3a1",  "a3b1",  "a3c1",  "a3a2",
-    "a3b2",  "a3c2",  "a3b3",  "a3c3",  "a3d3",  "a3e3",  "a3f3",  "a3g3",
-    "a3h3",  "a3a4",  "a3b4",  "a3c4",  "a3a5",  "a3b5",  "a3c5",  "a3a6",
-    "a3d6",  "a3a7",  "a3e7",  "a3a8",  "a3f8",  "b3a1",  "b3b1",  "b3c1",
-    "b3d1",  "b3a2",  "b3b2",  "b3c2",  "b3d2",  "b3a3",  "b3c3",  "b3d3",
-    "b3e3",  "b3f3",  "b3g3",  "b3h3",  "b3a4",  "b3b4",  "b3c4",  "b3d4",
-    "b3a5",  "b3b5",  "b3c5",  "b3d5",  "b3b6",  "b3e6",  "b3b7",  "b3f7",
-    "b3b8",  "b3g8",  "c3a1",  "c3b1",  "c3c1",  "c3d1",  "c3e1",  "c3a2",
-    "c3b2",  "c3c2",  "c3d2",  "c3e2",  "c3a3",  "c3b3",  "c3d3",  "c3e3",
-    "c3f3",  "c3g3",  "c3h3",  "c3a4",  "c3b4",  "c3c4",  "c3d4",  "c3e4",
-    "c3a5",  "c3b5",  "c3c5",  "c3d5",  "c3e5",  "c3c6",  "c3f6",  "c3c7",
-    "c3g7",  "c3c8",  "c3h8",  "d3b1",  "d3c1",  "d3d1",  "d3e1",  "d3f1",
-    "d3b2",  "d3c2",  "d3d2",  "d3e2",  "d3f2",  "d3a3",  "d3b3",  "d3c3",
-    "d3e3",  "d3f3",  "d3g3",  "d3h3",  "d3b4",  "d3c4",  "d3d4",  "d3e4",
-    "d3f4",  "d3b5",  "d3c5",  "d3d5",  "d3e5",  "d3f5",  "d3a6",  "d3d6",
-    "d3g6",  "d3d7",  "d3h7",  "d3d8",  "e3c1",  "e3d1",  "e3e1",  "e3f1",
-    "e3g1",  "e3c2",  "e3d2",  "e3e2",  "e3f2",  "e3g2",  "e3a3",  "e3b3",
-    "e3c3",  "e3d3",  "e3f3",  "e3g3",  "e3h3",  "e3c4",  "e3d4",  "e3e4",
-    "e3f4",  "e3g4",  "e3c5",  "e3d5",  "e3e5",  "e3f5",  "e3g5",  "e3b6",
-    "e3e6",  "e3h6",  "e3a7",  "e3e7",  "e3e8",  "f3d1",  "f3e1",  "f3f1",
-    "f3g1",  "f3h1",  "f3d2",  "f3e2",  "f3f2",  "f3g2",  "f3h2",  "f3a3",
-    "f3b3",  "f3c3",  "f3d3",  "f3e3",  "f3g3",  "f3h3",  "f3d4",  "f3e4",
-    "f3f4",  "f3g4",  "f3h4",  "f3d5",  "f3e5",  "f3f5",  "f3g5",  "f3h5",
-    "f3c6",  "f3f6",  "f3b7",  "f3f7",  "f3a8",  "f3f8",  "g3e1",  "g3f1",
-    "g3g1",  "g3h1",  "g3e2",  "g3f2",  "g3g2",  "g3h2",  "g3a3",  "g3b3",
-    "g3c3",  "g3d3",  "g3e3",  "g3f3",  "g3h3",  "g3e4",  "g3f4",  "g3g4",
-    "g3h4",  "g3e5",  "g3f5",  "g3g5",  "g3h5",  "g3d6",  "g3g6",  "g3c7",
-    "g3g7",  "g3b8",  "g3g8",  "h3f1",  "h3g1",  "h3h1",  "h3f2",  "h3g2",
-    "h3h2",  "h3a3",  "h3b3",  "h3c3",  "h3d3",  "h3e3",  "h3f3",  "h3g3",
-    "h3f4",  "h3g4",  "h3h4",  "h3f5",  "h3g5",  "h3h5",  "h3e6",  "h3h6",
-    "h3d7",  "h3h7",  "h3c8",  "h3h8",  "a4a1",  "a4d1",  "a4a2",  "a4b2",
-    "a4c2",  "a4a3",  "a4b3",  "a4c3",  "a4b4",  "a4c4",  "a4d4",  "a4e4",
-    "a4f4",  "a4g4",  "a4h4",  "a4a5",  "a4b5",  "a4c5",  "a4a6",  "a4b6",
-    "a4c6",  "a4a7",  "a4d7",  "a4a8",  "a4e8",  "b4b1",  "b4e1",  "b4a2",
-    "b4b2",  "b4c2",  "b4d2",  "b4a3",  "b4b3",  "b4c3",  "b4d3",  "b4a4",
-    "b4c4",  "b4d4",  "b4e4",  "b4f4",  "b4g4",  "b4h4",  "b4a5",  "b4b5",
-    "b4c5",  "b4d5",  "b4a6",  "b4b6",  "b4c6",  "b4d6",  "b4b7",  "b4e7",
-    "b4b8",  "b4f8",  "c4c1",  "c4f1",  "c4a2",  "c4b2",  "c4c2",  "c4d2",
-    "c4e2",  "c4a3",  "c4b3",  "c4c3",  "c4d3",  "c4e3",  "c4a4",  "c4b4",
-    "c4d4",  "c4e4",  "c4f4",  "c4g4",  "c4h4",  "c4a5",  "c4b5",  "c4c5",
-    "c4d5",  "c4e5",  "c4a6",  "c4b6",  "c4c6",  "c4d6",  "c4e6",  "c4c7",
-    "c4f7",  "c4c8",  "c4g8",  "d4a1",  "d4d1",  "d4g1",  "d4b2",  "d4c2",
-    "d4d2",  "d4e2",  "d4f2",  "d4b3",  "d4c3",  "d4d3",  "d4e3",  "d4f3",
-    "d4a4",  "d4b4",  "d4c4",  "d4e4",  "d4f4",  "d4g4",  "d4h4",  "d4b5",
-    "d4c5",  "d4d5",  "d4e5",  "d4f5",  "d4b6",  "d4c6",  "d4d6",  "d4e6",
-    "d4f6",  "d4a7",  "d4d7",  "d4g7",  "d4d8",  "d4h8",  "e4b1",  "e4e1",
-    "e4h1",  "e4c2",  "e4d2",  "e4e2",  "e4f2",  "e4g2",  "e4c3",  "e4d3",
-    "e4e3",  "e4f3",  "e4g3",  "e4a4",  "e4b4",  "e4c4",  "e4d4",  "e4f4",
-    "e4g4",  "e4h4",  "e4c5",  "e4d5",  "e4e5",  "e4f5",  "e4g5",  "e4c6",
-    "e4d6",  "e4e6",  "e4f6",  "e4g6",  "e4b7",  "e4e7",  "e4h7",  "e4a8",
-    "e4e8",  "f4c1",  "f4f1",  "f4d2",  "f4e2",  "f4f2",  "f4g2",  "f4h2",
-    "f4d3",  "f4e3",  "f4f3",  "f4g3",  "f4h3",  "f4a4",  "f4b4",  "f4c4",
-    "f4d4",  "f4e4",  "f4g4",  "f4h4",  "f4d5",  "f4e5",  "f4f5",  "f4g5",
-    "f4h5",  "f4d6",  "f4e6",  "f4f6",  "f4g6",  "f4h6",  "f4c7",  "f4f7",
-    "f4b8",  "f4f8",  "g4d1",  "g4g1",  "g4e2",  "g4f2",  "g4g2",  "g4h2",
-    "g4e3",  "g4f3",  "g4g3",  "g4h3",  "g4a4",  "g4b4",  "g4c4",  "g4d4",
-    "g4e4",  "g4f4",  "g4h4",  "g4e5",  "g4f5",  "g4g5",  "g4h5",  "g4e6",
-    "g4f6",  "g4g6",  "g4h6",  "g4d7",  "g4g7",  "g4c8",  "g4g8",  "h4e1",
-    "h4h1",  "h4f2",  "h4g2",  "h4h2",  "h4f3",  "h4g3",  "h4h3",  "h4a4",
-    "h4b4",  "h4c4",  "h4d4",  "h4e4",  "h4f4",  "h4g4",  "h4f5",  "h4g5",
-    "h4h5",  "h4f6",  "h4g6",  "h4h6",  "h4e7",  "h4h7",  "h4d8",  "h4h8",
-    "a5a1",  "a5e1",  "a5a2",  "a5d2",  "a5a3",  "a5b3",  "a5c3",  "a5a4",
-    "a5b4",  "a5c4",  "a5b5",  "a5c5",  "a5d5",  "a5e5",  "a5f5",  "a5g5",
-    "a5h5",  "a5a6",  "a5b6",  "a5c6",  "a5a7",  "a5b7",  "a5c7",  "a5a8",
-    "a5d8",  "b5b1",  "b5f1",  "b5b2",  "b5e2",  "b5a3",  "b5b3",  "b5c3",
-    "b5d3",  "b5a4",  "b5b4",  "b5c4",  "b5d4",  "b5a5",  "b5c5",  "b5d5",
-    "b5e5",  "b5f5",  "b5g5",  "b5h5",  "b5a6",  "b5b6",  "b5c6",  "b5d6",
-    "b5a7",  "b5b7",  "b5c7",  "b5d7",  "b5b8",  "b5e8",  "c5c1",  "c5g1",
-    "c5c2",  "c5f2",  "c5a3",  "c5b3",  "c5c3",  "c5d3",  "c5e3",  "c5a4",
-    "c5b4",  "c5c4",  "c5d4",  "c5e4",  "c5a5",  "c5b5",  "c5d5",  "c5e5",
-    "c5f5",  "c5g5",  "c5h5",  "c5a6",  "c5b6",  "c5c6",  "c5d6",  "c5e6",
-    "c5a7",  "c5b7",  "c5c7",  "c5d7",  "c5e7",  "c5c8",  "c5f8",  "d5d1",
-    "d5h1",  "d5a2",  "d5d2",  "d5g2",  "d5b3",  "d5c3",  "d5d3",  "d5e3",
-    "d5f3",  "d5b4",  "d5c4",  "d5d4",  "d5e4",  "d5f4",  "d5a5",  "d5b5",
-    "d5c5",  "d5e5",  "d5f5",  "d5g5",  "d5h5",  "d5b6",  "d5c6",  "d5d6",
-    "d5e6",  "d5f6",  "d5b7",  "d5c7",  "d5d7",  "d5e7",  "d5f7",  "d5a8",
-    "d5d8",  "d5g8",  "e5a1",  "e5e1",  "e5b2",  "e5e2",  "e5h2",  "e5c3",
-    "e5d3",  "e5e3",  "e5f3",  "e5g3",  "e5c4",  "e5d4",  "e5e4",  "e5f4",
-    "e5g4",  "e5a5",  "e5b5",  "e5c5",  "e5d5",  "e5f5",  "e5g5",  "e5h5",
-    "e5c6",  "e5d6",  "e5e6",  "e5f6",  "e5g6",  "e5c7",  "e5d7",  "e5e7",
-    "e5f7",  "e5g7",  "e5b8",  "e5e8",  "e5h8",  "f5b1",  "f5f1",  "f5c2",
-    "f5f2",  "f5d3",  "f5e3",  "f5f3",  "f5g3",  "f5h3",  "f5d4",  "f5e4",
-    "f5f4",  "f5g4",  "f5h4",  "f5a5",  "f5b5",  "f5c5",  "f5d5",  "f5e5",
-    "f5g5",  "f5h5",  "f5d6",  "f5e6",  "f5f6",  "f5g6",  "f5h6",  "f5d7",
-    "f5e7",  "f5f7",  "f5g7",  "f5h7",  "f5c8",  "f5f8",  "g5c1",  "g5g1",
-    "g5d2",  "g5g2",  "g5e3",  "g5f3",  "g5g3",  "g5h3",  "g5e4",  "g5f4",
-    "g5g4",  "g5h4",  "g5a5",  "g5b5",  "g5c5",  "g5d5",  "g5e5",  "g5f5",
-    "g5h5",  "g5e6",  "g5f6",  "g5g6",  "g5h6",  "g5e7",  "g5f7",  "g5g7",
-    "g5h7",  "g5d8",  "g5g8",  "h5d1",  "h5h1",  "h5e2",  "h5h2",  "h5f3",
-    "h5g3",  "h5h3",  "h5f4",  "h5g4",  "h5h4",  "h5a5",  "h5b5",  "h5c5",
-    "h5d5",  "h5e5",  "h5f5",  "h5g5",  "h5f6",  "h5g6",  "h5h6",  "h5f7",
-    "h5g7",  "h5h7",  "h5e8",  "h5h8",  "a6a1",  "a6f1",  "a6a2",  "a6e2",
-    "a6a3",  "a6d3",  "a6a4",  "a6b4",  "a6c4",  "a6a5",  "a6b5",  "a6c5",
-    "a6b6",  "a6c6",  "a6d6",  "a6e6",  "a6f6",  "a6g6",  "a6h6",  "a6a7",
-    "a6b7",  "a6c7",  "a6a8",  "a6b8",  "a6c8",  "b6b1",  "b6g1",  "b6b2",
-    "b6f2",  "b6b3",  "b6e3",  "b6a4",  "b6b4",  "b6c4",  "b6d4",  "b6a5",
-    "b6b5",  "b6c5",  "b6d5",  "b6a6",  "b6c6",  "b6d6",  "b6e6",  "b6f6",
-    "b6g6",  "b6h6",  "b6a7",  "b6b7",  "b6c7",  "b6d7",  "b6a8",  "b6b8",
-    "b6c8",  "b6d8",  "c6c1",  "c6h1",  "c6c2",  "c6g2",  "c6c3",  "c6f3",
-    "c6a4",  "c6b4",  "c6c4",  "c6d4",  "c6e4",  "c6a5",  "c6b5",  "c6c5",
-    "c6d5",  "c6e5",  "c6a6",  "c6b6",  "c6d6",  "c6e6",  "c6f6",  "c6g6",
-    "c6h6",  "c6a7",  "c6b7",  "c6c7",  "c6d7",  "c6e7",  "c6a8",  "c6b8",
-    "c6c8",  "c6d8",  "c6e8",  "d6d1",  "d6d2",  "d6h2",  "d6a3",  "d6d3",
-    "d6g3",  "d6b4",  "d6c4",  "d6d4",  "d6e4",  "d6f4",  "d6b5",  "d6c5",
-    "d6d5",  "d6e5",  "d6f5",  "d6a6",  "d6b6",  "d6c6",  "d6e6",  "d6f6",
-    "d6g6",  "d6h6",  "d6b7",  "d6c7",  "d6d7",  "d6e7",  "d6f7",  "d6b8",
-    "d6c8",  "d6d8",  "d6e8",  "d6f8",  "e6e1",  "e6a2",  "e6e2",  "e6b3",
-    "e6e3",  "e6h3",  "e6c4",  "e6d4",  "e6e4",  "e6f4",  "e6g4",  "e6c5",
-    "e6d5",  "e6e5",  "e6f5",  "e6g5",  "e6a6",  "e6b6",  "e6c6",  "e6d6",
-    "e6f6",  "e6g6",  "e6h6",  "e6c7",  "e6d7",  "e6e7",  "e6f7",  "e6g7",
-    "e6c8",  "e6d8",  "e6e8",  "e6f8",  "e6g8",  "f6a1",  "f6f1",  "f6b2",
-    "f6f2",  "f6c3",  "f6f3",  "f6d4",  "f6e4",  "f6f4",  "f6g4",  "f6h4",
-    "f6d5",  "f6e5",  "f6f5",  "f6g5",  "f6h5",  "f6a6",  "f6b6",  "f6c6",
-    "f6d6",  "f6e6",  "f6g6",  "f6h6",  "f6d7",  "f6e7",  "f6f7",  "f6g7",
-    "f6h7",  "f6d8",  "f6e8",  "f6f8",  "f6g8",  "f6h8",  "g6b1",  "g6g1",
-    "g6c2",  "g6g2",  "g6d3",  "g6g3",  "g6e4",  "g6f4",  "g6g4",  "g6h4",
-    "g6e5",  "g6f5",  "g6g5",  "g6h5",  "g6a6",  "g6b6",  "g6c6",  "g6d6",
-    "g6e6",  "g6f6",  "g6h6",  "g6e7",  "g6f7",  "g6g7",  "g6h7",  "g6e8",
-    "g6f8",  "g6g8",  "g6h8",  "h6c1",  "h6h1",  "h6d2",  "h6h2",  "h6e3",
-    "h6h3",  "h6f4",  "h6g4",  "h6h4",  "h6f5",  "h6g5",  "h6h5",  "h6a6",
-    "h6b6",  "h6c6",  "h6d6",  "h6e6",  "h6f6",  "h6g6",  "h6f7",  "h6g7",
-    "h6h7",  "h6f8",  "h6g8",  "h6h8",  "a7a1",  "a7g1",  "a7a2",  "a7f2",
-    "a7a3",  "a7e3",  "a7a4",  "a7d4",  "a7a5",  "a7b5",  "a7c5",  "a7a6",
-    "a7b6",  "a7c6",  "a7b7",  "a7c7",  "a7d7",  "a7e7",  "a7f7",  "a7g7",
-    "a7h7",  "a7a8",  "a7b8",  "a7c8",  "b7b1",  "b7h1",  "b7b2",  "b7g2",
-    "b7b3",  "b7f3",  "b7b4",  "b7e4",  "b7a5",  "b7b5",  "b7c5",  "b7d5",
-    "b7a6",  "b7b6",  "b7c6",  "b7d6",  "b7a7",  "b7c7",  "b7d7",  "b7e7",
-    "b7f7",  "b7g7",  "b7h7",  "b7a8",  "b7b8",  "b7c8",  "b7d8",  "c7c1",
-    "c7c2",  "c7h2",  "c7c3",  "c7g3",  "c7c4",  "c7f4",  "c7a5",  "c7b5",
-    "c7c5",  "c7d5",  "c7e5",  "c7a6",  "c7b6",  "c7c6",  "c7d6",  "c7e6",
-    "c7a7",  "c7b7",  "c7d7",  "c7e7",  "c7f7",  "c7g7",  "c7h7",  "c7a8",
-    "c7b8",  "c7c8",  "c7d8",  "c7e8",  "d7d1",  "d7d2",  "d7d3",  "d7h3",
-    "d7a4",  "d7d4",  "d7g4",  "d7b5",  "d7c5",  "d7d5",  "d7e5",  "d7f5",
-    "d7b6",  "d7c6",  "d7d6",  "d7e6",  "d7f6",  "d7a7",  "d7b7",  "d7c7",
-    "d7e7",  "d7f7",  "d7g7",  "d7h7",  "d7b8",  "d7c8",  "d7d8",  "d7e8",
-    "d7f8",  "e7e1",  "e7e2",  "e7a3",  "e7e3",  "e7b4",  "e7e4",  "e7h4",
-    "e7c5",  "e7d5",  "e7e5",  "e7f5",  "e7g5",  "e7c6",  "e7d6",  "e7e6",
-    "e7f6",  "e7g6",  "e7a7",  "e7b7",  "e7c7",  "e7d7",  "e7f7",  "e7g7",
-    "e7h7",  "e7c8",  "e7d8",  "e7e8",  "e7f8",  "e7g8",  "f7f1",  "f7a2",
-    "f7f2",  "f7b3",  "f7f3",  "f7c4",  "f7f4",  "f7d5",  "f7e5",  "f7f5",
-    "f7g5",  "f7h5",  "f7d6",  "f7e6",  "f7f6",  "f7g6",  "f7h6",  "f7a7",
-    "f7b7",  "f7c7",  "f7d7",  "f7e7",  "f7g7",  "f7h7",  "f7d8",  "f7e8",
-    "f7f8",  "f7g8",  "f7h8",  "g7a1",  "g7g1",  "g7b2",  "g7g2",  "g7c3",
-    "g7g3",  "g7d4",  "g7g4",  "g7e5",  "g7f5",  "g7g5",  "g7h5",  "g7e6",
-    "g7f6",  "g7g6",  "g7h6",  "g7a7",  "g7b7",  "g7c7",  "g7d7",  "g7e7",
-    "g7f7",  "g7h7",  "g7e8",  "g7f8",  "g7g8",  "g7h8",  "h7b1",  "h7h1",
-    "h7c2",  "h7h2",  "h7d3",  "h7h3",  "h7e4",  "h7h4",  "h7f5",  "h7g5",
-    "h7h5",  "h7f6",  "h7g6",  "h7h6",  "h7a7",  "h7b7",  "h7c7",  "h7d7",
-    "h7e7",  "h7f7",  "h7g7",  "h7f8",  "h7g8",  "h7h8",  "a8a1",  "a8h1",
-    "a8a2",  "a8g2",  "a8a3",  "a8f3",  "a8a4",  "a8e4",  "a8a5",  "a8d5",
-    "a8a6",  "a8b6",  "a8c6",  "a8a7",  "a8b7",  "a8c7",  "a8b8",  "a8c8",
-    "a8d8",  "a8e8",  "a8f8",  "a8g8",  "a8h8",  "b8b1",  "b8b2",  "b8h2",
-    "b8b3",  "b8g3",  "b8b4",  "b8f4",  "b8b5",  "b8e5",  "b8a6",  "b8b6",
-    "b8c6",  "b8d6",  "b8a7",  "b8b7",  "b8c7",  "b8d7",  "b8a8",  "b8c8",
-    "b8d8",  "b8e8",  "b8f8",  "b8g8",  "b8h8",  "c8c1",  "c8c2",  "c8c3",
-    "c8h3",  "c8c4",  "c8g4",  "c8c5",  "c8f5",  "c8a6",  "c8b6",  "c8c6",
-    "c8d6",  "c8e6",  "c8a7",  "c8b7",  "c8c7",  "c8d7",  "c8e7",  "c8a8",
-    "c8b8",  "c8d8",  "c8e8",  "c8f8",  "c8g8",  "c8h8",  "d8d1",  "d8d2",
-    "d8d3",  "d8d4",  "d8h4",  "d8a5",  "d8d5",  "d8g5",  "d8b6",  "d8c6",
-    "d8d6",  "d8e6",  "d8f6",  "d8b7",  "d8c7",  "d8d7",  "d8e7",  "d8f7",
-    "d8a8",  "d8b8",  "d8c8",  "d8e8",  "d8f8",  "d8g8",  "d8h8",  "e8e1",
-    "e8e2",  "e8e3",  "e8a4",  "e8e4",  "e8b5",  "e8e5",  "e8h5",  "e8c6",
-    "e8d6",  "e8e6",  "e8f6",  "e8g6",  "e8c7",  "e8d7",  "e8e7",  "e8f7",
-    "e8g7",  "e8a8",  "e8b8",  "e8c8",  "e8d8",  "e8f8",  "e8g8",  "e8h8",
-    "f8f1",  "f8f2",  "f8a3",  "f8f3",  "f8b4",  "f8f4",  "f8c5",  "f8f5",
-    "f8d6",  "f8e6",  "f8f6",  "f8g6",  "f8h6",  "f8d7",  "f8e7",  "f8f7",
-    "f8g7",  "f8h7",  "f8a8",  "f8b8",  "f8c8",  "f8d8",  "f8e8",  "f8g8",
-    "f8h8",  "g8g1",  "g8a2",  "g8g2",  "g8b3",  "g8g3",  "g8c4",  "g8g4",
-    "g8d5",  "g8g5",  "g8e6",  "g8f6",  "g8g6",  "g8h6",  "g8e7",  "g8f7",
-    "g8g7",  "g8h7",  "g8a8",  "g8b8",  "g8c8",  "g8d8",  "g8e8",  "g8f8",
-    "g8h8",  "h8a1",  "h8h1",  "h8b2",  "h8h2",  "h8c3",  "h8h3",  "h8d4",
-    "h8h4",  "h8e5",  "h8h5",  "h8f6",  "h8g6",  "h8h6",  "h8f7",  "h8g7",
-    "h8h7",  "h8a8",  "h8b8",  "h8c8",  "h8d8",  "h8e8",  "h8f8",  "h8g8",
-    "a7a8q", "a7a8r", "a7a8b", "a7b8q", "a7b8r", "a7b8b", "b7a8q", "b7a8r",
-    "b7a8b", "b7b8q", "b7b8r", "b7b8b", "b7c8q", "b7c8r", "b7c8b", "c7b8q",
-    "c7b8r", "c7b8b", "c7c8q", "c7c8r", "c7c8b", "c7d8q", "c7d8r", "c7d8b",
-    "d7c8q", "d7c8r", "d7c8b", "d7d8q", "d7d8r", "d7d8b", "d7e8q", "d7e8r",
-    "d7e8b", "e7d8q", "e7d8r", "e7d8b", "e7e8q", "e7e8r", "e7e8b", "e7f8q",
-    "e7f8r", "e7f8b", "f7e8q", "f7e8r", "f7e8b", "f7f8q", "f7f8r", "f7f8b",
-    "f7g8q", "f7g8r", "f7g8b", "g7f8q", "g7f8r", "g7f8b", "g7g8q", "g7g8r",
-    "g7g8b", "g7h8q", "g7h8r", "g7h8b", "h7g8q", "h7g8r", "h7g8b", "h7h8q",
-    "h7h8r", "h7h8b"};
-
-std::vector<unsigned short> BuildMoveIndices() {
-  std::vector<unsigned short> res(4 * 64 * 64);
-  for (size_t i = 0; i < sizeof(kIdxToMove) / sizeof(kIdxToMove[0]); ++i) {
-    res[kIdxToMove[i].as_packed_int()] = i;
-  }
-  return res;
-}
-
-const std::vector<unsigned short> kMoveToIdx = BuildMoveIndices();
-const int kKingCastleIndex =
-    kMoveToIdx[BoardSquare("e1").as_int() * 64 + BoardSquare("h1").as_int()];
-const int kQueenCastleIndex =
-    kMoveToIdx[BoardSquare("e1").as_int() * 64 + BoardSquare("a1").as_int()];
-
-BoardSquare Transform(BoardSquare sq, int transform) {
-  if ((transform & FlipTransform) != 0) {
-    sq.set(sq.row(), 7 - sq.col());
-  }
-  if ((transform & MirrorTransform) != 0) {
-    sq.set(7 - sq.row(), sq.col());
-  }
-  if ((transform & TransposeTransform) != 0) {
-    sq.set(7 - sq.col(), 7 - sq.row());
-  }
-  return sq;
-}
-}  // namespace
-
-Move::Move(const std::string& str, bool black) {
-  if (str.size() < 4) throw Exception("Bad move: " + str);
-  SetFrom(BoardSquare(str.substr(0, 2), black));
-  SetTo(BoardSquare(str.substr(2, 2), black));
-  if (str.size() != 4) {
-    if (str.size() != 5) throw Exception("Bad move: " + str);
-    switch (str[4]) {
-      case 'q':
-      case 'Q':
-        SetPromotion(Promotion::Queen);
-        break;
-      case 'r':
-      case 'R':
-        SetPromotion(Promotion::Rook);
-        break;
-      case 'b':
-      case 'B':
-        SetPromotion(Promotion::Bishop);
-        break;
-      case 'n':
-      case 'N':
-        SetPromotion(Promotion::Knight);
-        break;
-      default:
-        throw Exception("Bad move: " + str);
-    }
-  }
-}
-
-uint16_t Move::as_packed_int() const {
-  if (promotion() == Promotion::Knight) {
-    return from().as_int() * 64 + to().as_int();
-  } else {
-    return static_cast<int>(promotion()) * 64 * 64 + from().as_int() * 64 +
-           to().as_int();
-  }
-}
-
-uint16_t Move::as_nn_index(int transform) const {
-  if (transform == 0) {
-    return kMoveToIdx[as_packed_int()];
-  }
-  Move transformed = *this;
-  transformed.SetTo(Transform(to(), transform));
-  transformed.SetFrom(Transform(from(), transform));
-  return transformed.as_nn_index(0);
-}
-
-Move MoveFromNNIndex(int idx, int transform) {
-  Move m = kIdxToMove[idx];
-  if (transform == 0) {
-    return m;
-  }
-  int inv_transform;
-  if (transform & TransposeTransform) {
-    inv_transform = TransposeTransform;
-    if (transform & FlipTransform) inv_transform |= MirrorTransform;
-    if (transform & MirrorTransform) inv_transform |= FlipTransform;
-  } else {
-    inv_transform = transform;
-  }
-  m.SetTo(Transform(m.to(), inv_transform));
-  m.SetFrom(Transform(m.from(), inv_transform));
-  return m;
-}
-
-}  // namespace lczero
diff --git a/src/chess/bitboard.h b/src/chess/bitboard.h
index b6e6394d0f..e01b87dd93 100644
--- a/src/chess/bitboard.h
+++ b/src/chess/bitboard.h
@@ -32,59 +32,11 @@
 #include <string>
 #include <vector>
 
+#include "chess/types.h"
 #include "utils/bititer.h"
 
 namespace lczero {
 
-// Stores a coordinates of a single square.
-class BoardSquare {
- public:
-  constexpr BoardSquare() {}
-  // As a single number, 0 to 63, bottom to top, left to right.
-  // 0 is a1, 8 is a2, 63 is h8.
-  constexpr BoardSquare(std::uint8_t num) : square_(num) {}
-  // From row(bottom to top), and col(left to right), 0-based.
-  constexpr BoardSquare(int row, int col) : BoardSquare(row * 8 + col) {}
-  // From Square name, e.g e4. Only lowercase.
-  BoardSquare(const std::string& str, bool black = false)
-      : BoardSquare(black ? '8' - str[1] : str[1] - '1', str[0] - 'a') {}
-  constexpr std::uint8_t as_int() const { return square_; }
-  constexpr std::uint64_t as_board() const { return 1ULL << square_; }
-  void set(int row, int col) { square_ = row * 8 + col; }
-
-  // 0-based, bottom to top.
-  int row() const { return square_ / 8; }
-  // 0-based, left to right.
-  int col() const { return square_ % 8; }
-
-  // Row := 7 - row.  Col remains the same.
-  void Mirror() { square_ = square_ ^ 0b111000; }
-
-  // Checks whether coordinate is within 0..7.
-  static bool IsValidCoord(int x) { return x >= 0 && x < 8; }
-
-  // Checks whether coordinates are within 0..7.
-  static bool IsValid(int row, int col) {
-    return IsValidCoord(row) && IsValidCoord(col);
-  }
-
-  constexpr bool operator==(const BoardSquare& other) const {
-    return square_ == other.square_;
-  }
-
-  constexpr bool operator!=(const BoardSquare& other) const {
-    return square_ != other.square_;
-  }
-
-  // Returns the square in algebraic notation (e.g. "e4").
-  std::string as_string() const {
-    return std::string(1, 'a' + col()) + std::string(1, '1' + row());
-  }
-
- private:
-  std::uint8_t square_ = 0;  // Only lower six bits should be set.
-};
-
 // Represents a board as an array of 64 bits.
 // Bit enumeration goes from bottom to top, from left to right:
 // Square a1 is bit 0, square h1 is bit 7, square a2 is bit 8.
@@ -92,8 +44,9 @@ class BitBoard {
  public:
   constexpr BitBoard(std::uint64_t board) : board_(board) {}
   BitBoard() = default;
-  BitBoard(const BitBoard&) = default;
-  BitBoard& operator=(const BitBoard&) = default;
+  constexpr static BitBoard FromSquare(Square square) {
+    return BitBoard(1ULL << square.as_idx());
+  }
 
   std::uint64_t as_int() const { return board_; }
   void clear() { board_ = 0; }
@@ -134,30 +87,15 @@ class BitBoard {
 
   // Sets the value for given square to 1 if cond is true.
   // Otherwise does nothing (doesn't reset!).
-  void set_if(BoardSquare square, bool cond) { set_if(square.as_int(), cond); }
-  void set_if(std::uint8_t pos, bool cond) {
-    board_ |= (std::uint64_t(cond) << pos);
-  }
-  void set_if(int row, int col, bool cond) {
-    set_if(BoardSquare(row, col), cond);
+  void set_if(Square square, bool cond) {
+    board_ |= (static_cast<uint64_t>(cond) << square.as_idx());
   }
-
   // Sets value of given square to 1.
-  void set(BoardSquare square) { set(square.as_int()); }
-  void set(std::uint8_t pos) { board_ |= (std::uint64_t(1) << pos); }
-  void set(int row, int col) { set(BoardSquare(row, col)); }
-
+  void set(Square square) { board_ |= (1ULL << square.as_idx()); }
   // Sets value of given square to 0.
-  void reset(BoardSquare square) { reset(square.as_int()); }
-  void reset(std::uint8_t pos) { board_ &= ~(std::uint64_t(1) << pos); }
-  void reset(int row, int col) { reset(BoardSquare(row, col)); }
-
+  void reset(Square square) { board_ &= ~(1ULL << square.as_idx()); }
   // Gets value of a square.
-  bool get(BoardSquare square) const { return get(square.as_int()); }
-  bool get(std::uint8_t pos) const {
-    return board_ & (std::uint64_t(1) << pos);
-  }
-  bool get(int row, int col) const { return get(BoardSquare(row, col)); }
+  bool get(Square square) const { return board_ & (1ULL << square.as_idx()); }
 
   // Returns whether all bits of a board are set to 0.
   bool empty() const { return board_ == 0; }
@@ -168,26 +106,21 @@ class BitBoard {
   // Flips black and white side of a board.
   void Mirror() { board_ = ReverseBytesInBytes(board_); }
 
-  bool operator==(const BitBoard& other) const {
-    return board_ == other.board_;
-  }
+  bool operator==(const BitBoard& other) const = default;
+  bool operator!=(const BitBoard& other) const = default;
 
-  bool operator!=(const BitBoard& other) const {
-    return board_ != other.board_;
-  }
-
-  BitIterator<BoardSquare> begin() const { return board_; }
-  BitIterator<BoardSquare> end() const { return 0; }
+  struct Uin64ToSquare {
+    constexpr Square operator()(uint64_t x) { return Square::FromIdx(x); }
+  };
+  using Iterator = BitIterator<Square, Uin64ToSquare>;
+  Iterator begin() const { return board_; }
+  Iterator end() const { return 0; }
 
   std::string DebugString() const {
     std::string res;
     for (int i = 7; i >= 0; --i) {
-      for (int j = 0; j < 8; ++j) {
-        if (get(i, j))
-          res += '#';
-        else
-          res += '.';
-      }
+      for (int j = 0; j < 8; ++j)  // i: rank (row), j: file (column).
+        res += get({File::FromIdx(j), Rank::FromIdx(i)}) ? '#' : '.';
       res += '\n';
     }
     return res;
@@ -215,8 +148,8 @@ class BitBoard {
   }
 
   // Returns bitboard with one bit reset.
-  friend BitBoard operator-(const BitBoard& a, const BoardSquare& b) {
-    return {a.board_ & ~b.as_board()};
+  friend BitBoard operator-(const BitBoard& a, const Square& b) {
+    return {a.board_ & ~(1ULL << b.as_idx())};
   }
 
   // Returns difference (bitwise AND-NOT) of two boards.
@@ -228,77 +161,4 @@ class BitBoard {
   std::uint64_t board_ = 0;
 };
 
-class Move {
- public:
-  enum class Promotion : std::uint8_t { None, Queen, Rook, Bishop, Knight };
-  Move() = default;
-  constexpr Move(BoardSquare from, BoardSquare to)
-      : data_(to.as_int() + (from.as_int() << 6)) {}
-  constexpr Move(BoardSquare from, BoardSquare to, Promotion promotion)
-      : data_(to.as_int() + (from.as_int() << 6) +
-              (static_cast<uint8_t>(promotion) << 12)) {}
-  Move(const std::string& str, bool black = false);
-  Move(const char* str, bool black = false) : Move(std::string(str), black) {}
-
-  BoardSquare to() const { return BoardSquare(data_ & kToMask); }
-  BoardSquare from() const { return BoardSquare((data_ & kFromMask) >> 6); }
-  Promotion promotion() const { return Promotion((data_ & kPromoMask) >> 12); }
-
-  void SetTo(BoardSquare to) { data_ = (data_ & ~kToMask) | to.as_int(); }
-  void SetFrom(BoardSquare from) {
-    data_ = (data_ & ~kFromMask) | (from.as_int() << 6);
-  }
-  void SetPromotion(Promotion promotion) {
-    data_ = (data_ & ~kPromoMask) | (static_cast<uint8_t>(promotion) << 12);
-  }
-  // 0 .. 16384, knight promotion and no promotion is the same.
-  uint16_t as_packed_int() const;
-
-  // 0 .. 1857, to use in neural networks.
-  // Transform is a bit field which describes a transform to be applied to the
-  // the move before converting it to an index.
-  uint16_t as_nn_index(int transform) const;
-
-  explicit operator bool() const { return data_ != 0; }
-  bool operator==(const Move& other) const { return data_ == other.data_; }
-
-  void Mirror() { data_ ^= 0b111000111000; }
-
-  std::string as_string() const {
-    std::string res = from().as_string() + to().as_string();
-    switch (promotion()) {
-      case Promotion::None:
-        return res;
-      case Promotion::Queen:
-        return res + 'q';
-      case Promotion::Rook:
-        return res + 'r';
-      case Promotion::Bishop:
-        return res + 'b';
-      case Promotion::Knight:
-        return res + 'n';
-    }
-    assert(false);
-    return "Error!";
-  }
-
- private:
-  uint16_t data_ = 0;
-  // Move, using the following encoding:
-  // bits 0..5 "to"-square
-  // bits 6..11 "from"-square
-  // bits 12..14 promotion value
-
-  enum Masks : uint16_t {
-    kToMask = 0b0000000000111111,
-    kFromMask = 0b0000111111000000,
-    kPromoMask = 0b0111000000000000,
-  };
-};
-
-using MoveList = std::vector<Move>;
-
-// Gets the move from the NN move index, undoing the given transform.
-Move MoveFromNNIndex(int idx, int transform);
-
 }  // namespace lczero
diff --git a/src/chess/board.cc b/src/chess/board.cc
index 083b6b13ad..59bc0c39cd 100644
--- a/src/chess/board.cc
+++ b/src/chess/board.cc
@@ -29,9 +29,12 @@
 
 #include <algorithm>
 #include <cctype>
+#include <charconv>
 #include <cstdlib>
 #include <cstring>
 #include <sstream>
+#include <utility>
+#include <absl/cleanup/cleanup.h>
 
 #include "utils/exception.h"
 
@@ -49,9 +52,7 @@ const ChessBoard ChessBoard::kStartposBoard(ChessBoard::kStartposFen);
 
 const BitBoard ChessBoard::kPawnMask = 0x00FFFFFFFFFFFF00ULL;
 
-void ChessBoard::Clear() {
-  *this = ChessBoard();
-}
+void ChessBoard::Clear() { *this = ChessBoard(); }
 
 void ChessBoard::Mirror() {
   our_pieces_.Mirror();
@@ -60,8 +61,8 @@ void ChessBoard::Mirror() {
   rooks_.Mirror();
   bishops_.Mirror();
   pawns_.Mirror();
-  our_king_.Mirror();
-  their_king_.Mirror();
+  our_king_.Flip();
+  their_king_.Flip();
   std::swap(our_king_, their_king_);
   castlings_.Mirror();
   flipped_ = !flipped_;
@@ -174,12 +175,7 @@ static const BitBoard kPawnAttacks[] = {
     0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
     0x0000000000000000ULL};
 
-static const Move::Promotion kPromotions[] = {
-    Move::Promotion::Queen,
-    Move::Promotion::Rook,
-    Move::Promotion::Bishop,
-    Move::Promotion::Knight,
-};
+static constexpr PieceType kPromotions[] = {kQueen, kRook, kBishop, kKnight};
 
 // Magic bitboard routines and structures.
 // We use so-called "fancy" magic bitboards.
@@ -258,6 +254,11 @@ static MagicParams bishop_magic_params[64];
 static BitBoard rook_attacks_table[102400];
 static BitBoard bishop_attacks_table[5248];
 
+namespace {
+constexpr bool IsOnBoard(int x) { return x >= 0 && x < 8; }
+constexpr bool IsOnBoard(int x, int y) { return IsOnBoard(x) && IsOnBoard(y); }
+}  // namespace
+
 // Builds rook or bishop attacks table.
 static void BuildAttacksTable(MagicParams* magic_params,
                               BitBoard* attacks_table,
@@ -267,24 +268,24 @@ static void BuildAttacksTable(MagicParams* magic_params,
 
   // Initialize for all board squares.
   for (unsigned square = 0; square < 64; square++) {
-    const BoardSquare b_sq(square);
+    const Square b_sq = Square::FromIdx(square);
 
     // Calculate relevant occupancy masks.
     BitBoard mask = {0};
 
     for (int j = 0; j < 4; j++) {
       auto direction = directions[j];
-      auto dst_row = b_sq.row();
-      auto dst_col = b_sq.col();
+      auto dst_row = b_sq.rank().idx;
+      auto dst_col = b_sq.file().idx;
       while (true) {
         dst_row += direction.first;
         dst_col += direction.second;
         // If the next square in this direction is invalid, the current square
         // is at the board's edge and should not be added.
-        if (!BoardSquare::IsValid(dst_row + direction.first,
-                                  dst_col + direction.second))
+        if (!IsOnBoard(dst_row + direction.first, dst_col + direction.second))
           break;
-        const BoardSquare destination(dst_row, dst_col);
+        const Square destination(File::FromIdx(dst_col),
+                                 Rank::FromIdx(dst_row));
         mask.set(destination);
       }
     }
@@ -293,7 +294,7 @@ static void BuildAttacksTable(MagicParams* magic_params,
     magic_params[square].mask_ = mask.as_int();
 
     // Cache relevant occupancy board squares.
-    std::vector<BoardSquare> occupancy_squares;
+    std::vector<Square> occupancy_squares;
 
     for (auto occ_sq : BitBoard(magic_params[square].mask_)) {
       occupancy_squares.emplace_back(occ_sq);
@@ -327,13 +328,14 @@ static void BuildAttacksTable(MagicParams* magic_params,
 
       for (int j = 0; j < 4; j++) {
         auto direction = directions[j];
-        auto dst_row = b_sq.row();
-        auto dst_col = b_sq.col();
+        auto dst_row = b_sq.rank().idx;
+        auto dst_col = b_sq.file().idx;
         while (true) {
           dst_row += direction.first;
           dst_col += direction.second;
-          if (!BoardSquare::IsValid(dst_row, dst_col)) break;
-          const BoardSquare destination(dst_row, dst_col);
+          if (!IsOnBoard(dst_row, dst_col)) break;
+          const Square destination(File::FromIdx(dst_col),
+                                   Rank::FromIdx(dst_row));
           attacks.set(destination);
           if (occupancy.get(destination)) break;
         }
@@ -369,10 +371,10 @@ static void BuildAttacksTable(MagicParams* magic_params,
 
 // Returns the rook attacks bitboard for the given rook board square and the
 // given occupied piece bitboard.
-static inline BitBoard GetRookAttacks(const BoardSquare rook_square,
+static inline BitBoard GetRookAttacks(const Square rook_square,
                                       const BitBoard pieces) {
   // Calculate magic index.
-  const uint8_t square = rook_square.as_int();
+  const uint8_t square = rook_square.as_idx();
 
 #if defined(NO_PEXT)
   uint64_t index = pieces.as_int() & rook_magic_params[square].mask_;
@@ -388,10 +390,10 @@ static inline BitBoard GetRookAttacks(const BoardSquare rook_square,
 
 // Returns the bishop attacks bitboard for the given bishop board square and
 // the given occupied piece bitboard.
-static inline BitBoard GetBishopAttacks(const BoardSquare bishop_square,
+static inline BitBoard GetBishopAttacks(const Square bishop_square,
                                         const BitBoard pieces) {
   // Calculate magic index.
-  const uint8_t square = bishop_square.as_int();
+  const uint8_t square = bishop_square.as_idx();
 
 #if defined(NO_PEXT)
   uint64_t index = pieces.as_int() & bishop_magic_params[square].mask_;
@@ -432,50 +434,53 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const {
     // King
     if (source == our_king_) {
       for (const auto& delta : kKingMoves) {
-        const auto dst_row = source.row() + delta.first;
-        const auto dst_col = source.col() + delta.second;
-        if (!BoardSquare::IsValid(dst_row, dst_col)) continue;
-        const BoardSquare destination(dst_row, dst_col);
+        const Rank dst_rank = source.rank() + delta.first;
+        if (!dst_rank.IsValid()) continue;
+        const File dst_file = source.file() + delta.second;
+        if (!dst_file.IsValid()) continue;
+        const Square destination(dst_file, dst_rank);
         if (our_pieces_.get(destination)) continue;
         if (IsUnderAttack(destination)) continue;
-        result.emplace_back(source, destination);
+        result.emplace_back(Move::White(source, destination));
       }
       // Castlings.
-      auto walk_free = [this](int from, int to, int rook, int king) {
-        for (int i = from; i <= to; ++i) {
+      auto walk_free = [this](File from, File to, File rook, File king) {
+        for (File i = from; i <= to; ++i) {
           if (i == rook || i == king) continue;
-          if (our_pieces_.get(i) || their_pieces_.get(i)) return false;
+          if (our_pieces_.get({i, kRank1}) || their_pieces_.get({i, kRank1})) {
+            return false;
+          }
         }
         return true;
       };
       // @From may be less or greater than @to. @To is not included in check
       // unless it is the same with @from.
-      auto range_attacked = [this](int from, int to) {
-        if (from == to) return IsUnderAttack(from);
+      auto range_attacked = [this](File from, File to) {
+        if (from == to) return IsUnderAttack(Square(from, kRank1));
         const int increment = from < to ? 1 : -1;
         while (from != to) {
-          if (IsUnderAttack(from)) return true;
+          if (IsUnderAttack(Square(from, kRank1))) return true;
           from += increment;
         }
         return false;
       };
-      const uint8_t king = source.col();
+      const File king = source.file();
       // For castlings we don't check destination king square for checks, it
       // will be done in legal move check phase.
       if (castlings_.we_can_000()) {
-        const uint8_t qrook = castlings_.our_queenside_rook();
-        if (walk_free(std::min(static_cast<uint8_t>(C1), qrook),
-                      std::max(static_cast<uint8_t>(D1), king), qrook, king) &&
-            !range_attacked(king, C1)) {
-          result.emplace_back(source, BoardSquare(RANK_1, qrook));
+        const File qrook = castlings_.our_queenside_rook;
+        if (walk_free(std::min(kFileC, qrook), std::max(kFileD, king), qrook,
+                      king) &&
+            !range_attacked(king, kFileC)) {
+          result.emplace_back(Move::WhiteCastling(king, qrook));
         }
       }
       if (castlings_.we_can_00()) {
-        const uint8_t krook = castlings_.our_kingside_rook();
-        if (walk_free(std::min(static_cast<uint8_t>(F1), king),
-                      std::max(static_cast<uint8_t>(G1), krook), krook, king) &&
-            !range_attacked(king, G1)) {
-          result.emplace_back(source, BoardSquare(RANK_1, krook));
+        const File krook = castlings_.our_kingside_rook;
+        if (walk_free(std::min(kFileF, king), std::max(kFileG, krook), krook,
+                      king) &&
+            !range_attacked(king, kFileG)) {
+          result.emplace_back(Move::WhiteCastling(king, krook));
         }
       }
       continue;
@@ -488,7 +493,7 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const {
           GetRookAttacks(source, our_pieces_ | their_pieces_) - our_pieces_;
 
       for (const auto& destination : attacked) {
-        result.emplace_back(source, destination);
+        result.emplace_back(Move::White(source, destination));
       }
     }
     // Bishop (and queen)
@@ -498,7 +503,7 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const {
           GetBishopAttacks(source, our_pieces_ | their_pieces_) - our_pieces_;
 
       for (const auto& destination : attacked) {
-        result.emplace_back(source, destination);
+        result.emplace_back(Move::White(source, destination));
       }
     }
     if (processed_piece) continue;
@@ -506,24 +511,25 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const {
     if ((pawns_ & kPawnMask).get(source)) {
       // Moves forward.
       {
-        const auto dst_row = source.row() + 1;
-        const auto dst_col = source.col();
-        const BoardSquare destination(dst_row, dst_col);
+        const Rank dst_rank = source.rank() + 1;
+        const File dst_file = source.file();
+        const Square destination(dst_file, dst_rank);
 
         if (!our_pieces_.get(destination) && !their_pieces_.get(destination)) {
-          if (dst_row != RANK_8) {
-            result.emplace_back(source, destination);
-            if (dst_row == RANK_3) {
+          if (dst_rank != kRank8) {
+            result.emplace_back(Move::White(source, destination));
+            if (dst_rank == kRank3) {
               // Maybe it'll be possible to move two squares.
-              if (!our_pieces_.get(RANK_4, dst_col) &&
-                  !their_pieces_.get(RANK_4, dst_col)) {
-                result.emplace_back(source, BoardSquare(RANK_4, dst_col));
+              const Square jump_dst(dst_file, kRank4);
+              if (!our_pieces_.get(jump_dst) && !their_pieces_.get(jump_dst)) {
+                result.emplace_back(Move::White(source, jump_dst));
               }
             }
           } else {
             // Promotions
             for (auto promotion : kPromotions) {
-              result.emplace_back(source, destination, promotion);
+              result.emplace_back(
+                  Move::WhitePromotion(source, destination, promotion));
             }
           }
         }
@@ -531,25 +537,27 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const {
       // Captures.
       {
         for (auto direction : {-1, 1}) {
-          const auto dst_row = source.row() + 1;
-          const auto dst_col = source.col() + direction;
-          if (dst_col < 0 || dst_col >= 8) continue;
-          const BoardSquare destination(dst_row, dst_col);
+          const auto dst_rank = source.rank() + 1;
+          const auto dst_file = source.file() + direction;
+          if (!dst_file.IsValid()) continue;
+          const Square destination(dst_file, dst_rank);
           if (their_pieces_.get(destination)) {
-            if (dst_row == RANK_8) {
+            if (dst_rank == kRank8) {
               // Promotion.
               for (auto promotion : kPromotions) {
-                result.emplace_back(source, destination, promotion);
+                result.emplace_back(
+                    Move::WhitePromotion(source, destination, promotion));
               }
             } else {
               // Ordinary capture.
-              result.emplace_back(source, destination);
+              result.emplace_back(Move::White(source, destination));
             }
-          } else if (dst_row == RANK_6 && pawns_.get(RANK_8, dst_col)) {
+          } else if (dst_rank == kRank6 &&
+                     pawns_.get(Square(dst_file, kRank8))) {
             // En passant.
             // "Pawn" on opponent's file 8 means that en passant is possible.
             // Those fake pawns are reset in ApplyMove.
-            result.emplace_back(source, destination);
+            result.emplace_back(Move::WhiteEnPassant(source, destination));
           }
         }
       }
@@ -558,58 +566,77 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const {
     // Knight.
     {
       for (const auto destination :
-           kKnightAttacks[source.as_int()] - our_pieces_) {
-        result.emplace_back(source, destination);
+           kKnightAttacks[source.as_idx()] - our_pieces_) {
+        result.emplace_back(Move::White(source, destination));
       }
     }
   }
   return result;
 }  // namespace lczero
 
+bool ChessBoard::IsValid() const {
+  const auto all = ours() | theirs();
+  auto check = all | pawns() | bishops() | rooks() | queens() | kings();
+  if (check != all ||
+      (pawns() & bishops()).as_int() ||
+      (pawns() & rooks()).as_int() ||
+      (pawns() & queens()).as_int() ||
+      (pawns() & kings()).as_int() ||
+      (bishops() & rooks()).as_int() ||
+      (bishops() & queens()).as_int() ||
+      (bishops() & kings()).as_int() ||
+      (rooks() & queens()).as_int() ||
+      (rooks() & kings()).as_int() ||
+      (queens() & kings()).as_int()) {
+    return false;
+  }
+  return true;
+}
+
 bool ChessBoard::ApplyMove(Move move) {
-  const auto& from = move.from();
-  const auto& to = move.to();
-  const auto from_row = from.row();
-  const auto from_col = from.col();
-  const auto to_row = to.row();
-  const auto to_col = to.col();
+  assert(our_pieces_.intersects(BitBoard::FromSquare(move.from())));
+#ifndef NDEBUG
+  absl::Cleanup validate = [&] {
+    if (!IsValid()) {
+      CERR << "Move " + move.ToString(true) +
+                  " resulted in invalid board: " + DebugString();
+      assert(false);
+    }
+  };
+#endif
+  const Square& from = move.from();
+  const Square& to = move.to();
+  const Rank from_rank = from.rank();
+  const File from_file = from.file();
+  const Rank to_rank = to.rank();
+  const File to_file = to.file();
 
   // Castlings.
   if (from == our_king_) {
     castlings_.reset_we_can_00();
     castlings_.reset_we_can_000();
-    auto do_castling = [this](int king_dst, int rook_src, int rook_dst) {
+    auto do_castling = [this](File king_dst, Square rook_src, File rook_dst) {
       // Remove en passant flags.
       pawns_ &= kPawnMask;
       our_pieces_.reset(our_king_);
       our_pieces_.reset(rook_src);
       rooks_.reset(rook_src);
-      our_pieces_.set(king_dst);
-      our_pieces_.set(rook_dst);
-      rooks_.set(rook_dst);
-      our_king_ = king_dst;
+      our_king_ = Square(king_dst, kRank1);
+      our_pieces_.set(our_king_);
+      Square rook_dst_sq(rook_dst, kRank1);
+      our_pieces_.set(rook_dst_sq);
+      rooks_.set(rook_dst_sq);
     };
-    if (from_row == RANK_1 && to_row == RANK_1) {
-      const auto our_rooks = rooks() & our_pieces_;
-      if (our_rooks.get(to)) {
-        // Castling.
-        if (to_col > from_col) {
-          // Kingside.
-          do_castling(G1, to.as_int(), F1);
-        } else {
-          // Queenside.
-          do_castling(C1, to.as_int(), D1);
-        }
-        return false;
-      } else if (from_col == FILE_E && to_col == FILE_G) {
-        // Non FRC-style e1g1 castling (as opposed to e1h1).
-        do_castling(G1, H1, F1);
-        return false;
-      } else if (from_col == FILE_E && to_col == FILE_C) {
-        // Non FRC-style e1c1 castling (as opposed to e1a1).
-        do_castling(C1, A1, D1);
-        return false;
+    if (move.is_castling()) {
+      // Castling.
+      if (to_file > from_file) {
+        // Kingside.
+        do_castling(kFileG, to, kFileF);
+      } else {
+        // Queenside.
+        do_castling(kFileC, to, kFileD);
       }
+      return false;
     }
   }
 
@@ -619,22 +646,24 @@ bool ChessBoard::ApplyMove(Move move) {
 
   // Remove captured piece.
   bool reset_50_moves = their_pieces_.get(to);
-  their_pieces_.reset(to);
-  rooks_.reset(to);
-  bishops_.reset(to);
-  pawns_.reset(to);
-  if (to.as_int() == A8 + castlings_.their_kingside_rook()) {
-    castlings_.reset_they_can_00();
-  }
-  if (to.as_int() == A8 + castlings_.their_queenside_rook()) {
-    castlings_.reset_they_can_000();
+  if (reset_50_moves) {
+    their_pieces_.reset(to);
+    rooks_.reset(to);
+    bishops_.reset(to);
+    pawns_.reset(to);
+    if (to == Square(castlings_.their_kingside_rook, kRank8)) {
+      castlings_.reset_they_can_00();
+    }
+    if (to == Square(castlings_.their_queenside_rook, kRank8)) {
+      castlings_.reset_they_can_000();
+    }
   }
 
   // En passant.
-  if (from_row == RANK_5 && pawns_.get(from) && from_col != to_col &&
-      pawns_.get(RANK_8, to_col)) {
-    pawns_.reset(RANK_5, to_col);
-    their_pieces_.reset(RANK_5, to_col);
+  if (move.is_en_passant()) {
+    const Square ep_pawn(to_file, kRank5);
+    pawns_.reset(ep_pawn);
+    their_pieces_.reset(ep_pawn);
   }
 
   // Remove en passant flags.
@@ -650,15 +679,15 @@ bool ChessBoard::ApplyMove(Move move) {
   }
 
   // Promotion.
-  if (to_row == RANK_8 && pawns_.get(from)) {
-    switch (move.promotion()) {
-      case Move::Promotion::Rook:
+  if (move.is_promotion()) {
+    switch (move.promotion().idx) {
+      case kRook.idx:
         rooks_.set(to);
         break;
-      case Move::Promotion::Bishop:
+      case kBishop.idx:
         bishops_.set(to);
         break;
-      case Move::Promotion::Queen:
+      case kQueen.idx:
         rooks_.set(to);
         bishops_.set(to);
         break;
@@ -669,11 +698,11 @@ bool ChessBoard::ApplyMove(Move move) {
   }
 
   // Reset castling rights.
-  if (from_row == RANK_1 && rooks_.get(from)) {
-    if (from_col == castlings_.our_queenside_rook()) {
+  if (from_rank == kRank1 && rooks_.get(from)) {
+    if (from_file == castlings_.our_queenside_rook) {
       castlings_.reset_we_can_000();
     }
-    if (from_col == castlings_.our_kingside_rook()) {
+    if (from_file == castlings_.our_kingside_rook) {
       castlings_.reset_we_can_00();
     }
   }
@@ -687,23 +716,23 @@ bool ChessBoard::ApplyMove(Move move) {
   pawns_.reset(from);
 
   // Set en passant flag.
-  if (to_row - from_row == 2 && pawns_.get(to)) {
-    BoardSquare ep_sq(to_row - 1, to_col);
-    if (kPawnAttacks[ep_sq.as_int()].intersects(their_pieces_ & pawns_)) {
-      pawns_.set(0, to_col);
+  if (to_rank - from_rank == 2 && pawns_.get(to)) {
+    Square ep_sq(to_file, to_rank - 1);
+    if (kPawnAttacks[ep_sq.as_idx()].intersects(their_pieces_ & pawns_)) {
+      pawns_.set(Square(to_file, kRank1));
     }
   }
   return reset_50_moves;
 }
 
-bool ChessBoard::IsUnderAttack(BoardSquare square) const {
-  const int row = square.row();
-  const int col = square.col();
+bool ChessBoard::IsUnderAttack(Square square) const {
+  const Rank rank = square.rank();
+  const File file = square.file();
   // Check king.
   {
-    const int krow = their_king_.row();
-    const int kcol = their_king_.col();
-    if (std::abs(krow - row) <= 1 && std::abs(kcol - col) <= 1) return true;
+    const Rank krank = their_king_.rank();
+    const File kfile = their_king_.file();
+    if (std::abs(krank - rank) <= 1 && std::abs(kfile - file) <= 1) return true;
   }
   // Check rooks (and queens).
   if (GetRookAttacks(square, our_pieces_ | their_pieces_)
@@ -716,12 +745,12 @@ bool ChessBoard::IsUnderAttack(BoardSquare square) const {
     return true;
   }
   // Check pawns.
-  if (kPawnAttacks[square.as_int()].intersects(their_pieces_ & pawns_)) {
+  if (kPawnAttacks[square.as_idx()].intersects(their_pieces_ & pawns_)) {
     return true;
   }
   // Check knights.
   {
-    if (kKnightAttacks[square.as_int()].intersects(their_pieces_ - their_king_ -
+    if (kKnightAttacks[square.as_idx()].intersects(their_pieces_ - their_king_ -
                                                    rooks_ - bishops_ -
                                                    (pawns_ & kPawnMask))) {
       return true;
@@ -730,60 +759,29 @@ bool ChessBoard::IsUnderAttack(BoardSquare square) const {
   return false;
 }
 
-bool ChessBoard::IsSameMove(Move move1, Move move2) const {
-  // If moves are equal, it's the same move.
-  if (move1 == move2) return true;
-  // Explicitly check all legacy castling moves. Need to check for king, for
-  // e.g. rook e1a1 and e1c1 are different moves.
-  if (move1.from() != move2.from() || move1.from() != E1 ||
-      our_king_ != move1.from()) {
-    return false;
-  }
-  if (move1.to() == A1 && move2.to() == C1) return true;
-  if (move1.to() == C1 && move2.to() == A1) return true;
-  if (move1.to() == G1 && move2.to() == H1) return true;
-  if (move1.to() == H1 && move2.to() == G1) return true;
-  return false;
-}
-
-Move ChessBoard::GetLegacyMove(Move move) const {
-  if (our_king_ != move.from() || !our_pieces_.get(move.to())) {
-    return move;
-  }
-  if (move == Move(E1, H1)) return Move(E1, G1);
-  if (move == Move(E1, A1)) return Move(E1, C1);
-  return move;
-}
-
-Move ChessBoard::GetModernMove(Move move) const {
-  if (our_king_ != E1 || move.from() != E1) return move;
-  if (move == Move(E1, G1) && !our_pieces_.get(G1)) return Move(E1, H1);
-  if (move == Move(E1, C1) && !our_pieces_.get(C1)) return Move(E1, A1);
-  return move;
-}
-
 KingAttackInfo ChessBoard::GenerateKingAttackInfo() const {
   KingAttackInfo king_attack_info;
 
   // Number of attackers that give check (used for double check detection).
   unsigned num_king_attackers = 0;
 
-  const int row = our_king_.row();
-  const int col = our_king_.col();
+  const int row = our_king_.rank().idx;
+  const int col = our_king_.file().idx;
   // King checks are unnecessary, as kings cannot give check.
   // Check rooks (and queens).
-  if (kRookAttacks[our_king_.as_int()].intersects(their_pieces_ & rooks_)) {
+  if (kRookAttacks[our_king_.as_idx()].intersects(their_pieces_ & rooks_)) {
     for (const auto& direction : kRookDirections) {
       auto dst_row = row;
       auto dst_col = col;
       BitBoard attack_line(0);
       bool possible_pinned_piece_found = false;
-      BoardSquare possible_pinned_piece;
+      Square possible_pinned_piece;
       while (true) {
         dst_row += direction.first;
         dst_col += direction.second;
-        if (!BoardSquare::IsValid(dst_row, dst_col)) break;
-        const BoardSquare destination(dst_row, dst_col);
+        if (!IsOnBoard(dst_row, dst_col)) break;
+        const Square destination(File::FromIdx(dst_col),
+                                 Rank::FromIdx(dst_row));
         if (our_pieces_.get(destination)) {
           if (possible_pinned_piece_found) {
             // No pieces pinned.
@@ -815,18 +813,19 @@ KingAttackInfo ChessBoard::GenerateKingAttackInfo() const {
     }
   }
   // Check bishops.
-  if (kBishopAttacks[our_king_.as_int()].intersects(their_pieces_ & bishops_)) {
+  if (kBishopAttacks[our_king_.as_idx()].intersects(their_pieces_ & bishops_)) {
     for (const auto& direction : kBishopDirections) {
       auto dst_row = row;
       auto dst_col = col;
       BitBoard attack_line(0);
       bool possible_pinned_piece_found = false;
-      BoardSquare possible_pinned_piece;
+      Square possible_pinned_piece;
       while (true) {
         dst_row += direction.first;
         dst_col += direction.second;
-        if (!BoardSquare::IsValid(dst_row, dst_col)) break;
-        const BoardSquare destination(dst_row, dst_col);
+        if (!IsOnBoard(dst_row, dst_col)) break;
+        const Square destination(File::FromIdx(dst_col),
+                                 Rank::FromIdx(dst_row));
         if (our_pieces_.get(destination)) {
           if (possible_pinned_piece_found) {
             // No pieces pinned.
@@ -859,7 +858,7 @@ KingAttackInfo ChessBoard::GenerateKingAttackInfo() const {
   }
   // Check pawns.
   const BitBoard attacking_pawns =
-      kPawnAttacks[our_king_.as_int()] & their_pieces_ & pawns_;
+      kPawnAttacks[our_king_.as_idx()] & their_pieces_ & pawns_;
   king_attack_info.attack_lines_ =
       king_attack_info.attack_lines_ | attacking_pawns;
 
@@ -870,7 +869,7 @@ KingAttackInfo ChessBoard::GenerateKingAttackInfo() const {
 
   // Check knights.
   const BitBoard attacking_knights =
-      kKnightAttacks[our_king_.as_int()] &
+      kKnightAttacks[our_king_.as_idx()] &
       (their_pieces_ - their_king_ - rooks_ - bishops_ - (pawns_ & kPawnMask));
   king_attack_info.attack_lines_ =
       king_attack_info.attack_lines_ | attacking_knights;
@@ -893,8 +892,7 @@ bool ChessBoard::IsLegalMove(Move move,
 
   // En passant. Complex but rare. Just apply
   // and check that we are not under check.
-  if (from.row() == 4 && pawns_.get(from) && from.col() != to.col() &&
-      pawns_.get(7, to.col())) {
+  if (move.is_en_passant()) {
     ChessBoard board(*this);
     board.ApplyMove(move);
     return !board.IsUnderCheck();
@@ -930,8 +928,7 @@ bool ChessBoard::IsLegalMove(Move move,
 
   // King moves.
   if (from == our_king_) {
-    if (from.row() != 0 || to.row() != 0 ||
-        (abs(from.col() - to.col()) == 1 && !our_pieces_.get(to))) {
+    if (!move.is_castling()) {
       // Non-castling move. Already checked during movegen.
       return true;
     }
@@ -947,10 +944,10 @@ bool ChessBoard::IsLegalMove(Move move,
 
   // The piece is pinned. Now check that it stays on the same line w.r.t. the
   // king.
-  const int dx_from = from.col() - our_king_.col();
-  const int dy_from = from.row() - our_king_.row();
-  const int dx_to = to.col() - our_king_.col();
-  const int dy_to = to.row() - our_king_.row();
+  const int dx_from = from.file() - our_king_.file();
+  const int dy_from = from.rank() - our_king_.rank();
+  const int dx_to = to.file() - our_king_.file();
+  const int dy_to = to.rank() - our_king_.rank();
 
   if (dx_from == 0 || dx_to == 0) {
     return (dx_from == dx_to);
@@ -969,158 +966,156 @@ MoveList ChessBoard::GenerateLegalMoves() const {
   return result;
 }
 
-void ChessBoard::SetFromFen(std::string fen, int* rule50_ply, int* moves) {
+void ChessBoard::PutPiece(Square square, PieceType piece, bool is_theirs) {
+  (is_theirs ? their_pieces_ : our_pieces_).set(square);
+  if (piece == kKing) (is_theirs ? their_king_ : our_king_) = square;
+  if (piece == kPawn) pawns_.set(square);
+  if (piece == kRook || piece == kQueen) rooks_.set(square);
+  if (piece == kBishop || piece == kQueen) bishops_.set(square);
+}
+
+void ChessBoard::SetFromFen(std::string_view fen, int* rule50_ply, int* moves) {
   Clear();
-  int row = 7;
-  int col = 0;
-
-  // Remove any trailing whitespaces to detect eof after the last field.
-  fen.erase(std::find_if(fen.rbegin(), fen.rend(),
-                         [](char c) { return !std::isspace(c); })
-                .base(),
-            fen.end());
-
-  std::istringstream fen_str(fen);
-  std::string board;
-  fen_str >> board;
-  std::string who_to_move = "w";
-  if (!fen_str.eof()) fen_str >> who_to_move;
-  // Assume no castling rights. Other engines, e.g., Stockfish, assume kings and
-  // rooks on their initial rows can each castle with the outer-most rook.  Our
-  // implementation currently supports 960 castling where white and black rooks
-  // have matching columns, so it's unclear which rights to assume.
-  std::string castlings = "-";
-  if (!fen_str.eof()) fen_str >> castlings;
-  std::string en_passant = "-";
-  if (!fen_str.eof()) fen_str >> en_passant;
-  int rule50_halfmoves = 0;
-  if (!fen_str.eof()) fen_str >> rule50_halfmoves;
-  int total_moves = 1;
-  if (!fen_str.eof()) fen_str >> total_moves;
-  if (!fen_str) throw Exception("Bad fen string: " + fen);
-
-  for (char c : board) {
+  if (rule50_ply) *rule50_ply = 0;
+  if (moves) *moves = 1;
+  Rank rank = kRank8;
+  File file = kFileA;
+  size_t pos = 0;
+
+  auto complain = [&](std::string_view msg) {
+    throw Exception("Bad fen string (" + std::string(msg) +
+                    "): " + std::string(fen));
+  };
+  auto skip_whitespace = [&](std::string_view where = {}) {
+    if (!where.empty() && pos < fen.size() && fen[pos] != ' ') {
+      complain("space expected " + std::string(where));
+    }
+    while (pos < fen.size() && fen[pos] == ' ') ++pos;
+    return pos == fen.size();
+  };
+
+  // Skip leading whitespaces.
+  skip_whitespace();
+
+  // Parse board position.
+  for (; pos < fen.size(); ++pos) {
+    const char c = fen[pos];
+    if (c == ' ') break;
     if (c == '/') {
-      --row;
-      if (row < 0) throw Exception("Bad fen string (too many rows): " + fen);
-      col = 0;
+      if (rank == kRank1) complain("too many ranks");
+      --rank;
+      file = kFileA;
       continue;
     }
-    if (std::isdigit(c)) {
-      col += c - '0';
+    if (c >= '1' && c <= '8') {
+      file += c - '0';
+      if (file > File::FromIdx(8)) complain("too many files");
       continue;
     }
-    if (col >= 8) throw Exception("Bad fen string (too many columns): " + fen);
-
-    if (std::isupper(c)) {
-      // White piece.
-      our_pieces_.set(row, col);
-    } else {
-      // Black piece.
-      their_pieces_.set(row, col);
+    PieceType piece = PieceType::Parse(c);
+    if (!piece.IsValid()) complain("invalid character as piece");
+    if (!file.IsValid() || !rank.IsValid()) complain("piece out of board");
+    if (piece == kPawn && (rank == kRank1 || rank == kRank8)) {
+      complain("pawn on back rank");
     }
+    PutPiece(Square(file, rank), piece, std::islower(c));
+    ++file;
+  }
+  if (skip_whitespace("after the board")) return;
 
-    if (c == 'K') {
-      our_king_.set(row, col);
-    } else if (c == 'k') {
-      their_king_.set(row, col);
-    } else if (c == 'R' || c == 'r') {
-      rooks_.set(row, col);
-    } else if (c == 'B' || c == 'b') {
-      bishops_.set(row, col);
-    } else if (c == 'Q' || c == 'q') {
-      rooks_.set(row, col);
-      bishops_.set(row, col);
-    } else if (c == 'P' || c == 'p') {
-      if (row == 7 || row == 0) {
-        throw Exception("Bad fen string (pawn in first/last row): " + fen);
-      }
-      pawns_.set(row, col);
-    } else if (c == 'N' || c == 'n') {
-      // Do nothing
-    } else {
-      throw Exception("Bad fen string: " + fen);
-    }
-    ++col;
+  // Parsing side to move.
+  const char side_to_move = std::tolower(fen[pos++]);
+  if (side_to_move == 'b') {
+    Mirror();
+  } else if (side_to_move != 'w') {
+    complain("invalid side to move");
   }
+  if (skip_whitespace("after side to move")) return;
 
-  if (castlings != "-") {
-    uint8_t our_left_rook = FILE_A;
-    uint8_t our_right_rook = FILE_H;
-    uint8_t their_left_rook = FILE_A;
-    uint8_t their_right_rook = FILE_H;
-    for (char c : castlings) {
-      const bool is_black = std::islower(c);
-      const int king_col = (is_black ? their_king_ : our_king_).col();
-      const auto rooks =
-          (is_black ? their_pieces_ : our_pieces_) & ChessBoard::rooks();
-      auto find_rook = [rooks, king_col, fen](bool forward, uint8_t rank) {
-        uint8_t rook;
-        for (rook = forward ? FILE_A : FILE_H; rook != king_col;
-             rook += 2 * forward - 1) {
-          if (rooks.get(rank, rook)) break;
-        }
-        if (rook == king_col) {
-          throw Exception("Bad fen string (missing rook): " + fen);
-        }
-        return rook;
-      };
-      if (c == 'K') {
-        // Finding rightmost rook.
-        our_right_rook = find_rook(false, RANK_1);
-        castlings_.set_we_can_00();
-      } else if (c == 'Q') {
-        // Finding leftmost rook.
-        our_left_rook = find_rook(true, RANK_1);
-        castlings_.set_we_can_000();
-      } else if (c >= 'A' && c <= 'H') {
-        int rook_col = c - 'A';
-        if (rook_col < king_col) {
-          our_left_rook = rook_col;
-          castlings_.set_we_can_000();
-        } else {
-          our_right_rook = rook_col;
-          castlings_.set_we_can_00();
+  // Parse castling rights.
+  if (fen[pos] == '-') {
+    ++pos;
+  } else {
+    auto find_rook = [&](bool theirs, bool kingside) -> File {
+      const Rank rank = theirs ? kRank8 : kRank1;
+      for (File file = kingside ? kFileH : kFileA;
+           file != (theirs ? their_king_.file() : our_king_.file());
+           kingside ? --file : ++file) {
+        Square sq(file, rank);
+        if (!rooks().get(sq)) continue;
+        if (theirs ? their_pieces_.get(sq) : our_pieces_.get(sq)) {
+          return file;
         }
-      } else if (c == 'k') {
-        // Finding rightmost rook.
-        their_right_rook = find_rook(false, RANK_8);
+      }
+      complain("missing rook for castling");
+      return kFileA;  // Unreachable.
+    };
+    for (; pos < fen.size(); ++pos) {
+      const char c = fen[pos];
+      if (c == ' ') break;
+      const bool theirs = bool(std::isupper(c)) == flipped();
+      bool kingside = false;
+      File file;
+      if (c == 'K' || c == 'Q' || c == 'k' || c == 'q') {
+        kingside = std::tolower(c) == 'k';
+        file = find_rook(theirs, kingside);
+      } else {
+        file = File::Parse(c);
+        if (!file.IsValid()) complain("invalid character in castling");
+        kingside = file > (theirs ? their_king_.file() : our_king_.file());
+      }
+      if (kingside && theirs) {
         castlings_.set_they_can_00();
-      } else if (c == 'q') {
-        // Finding leftmost rook.
-        their_left_rook = find_rook(true, RANK_8);
+        castlings_.their_kingside_rook = file;
+      } else if (kingside && !theirs) {
+        castlings_.set_we_can_00();
+        castlings_.our_kingside_rook = file;
+      } else if (!kingside && theirs) {
         castlings_.set_they_can_000();
-      } else if (c >= 'a' && c <= 'h') {
-        int rook_col = c - 'a';
-        if (rook_col < king_col) {
-          their_left_rook = rook_col;
-          castlings_.set_they_can_000();
-        } else {
-          their_right_rook = rook_col;
-          castlings_.set_they_can_00();
-        }
-      } else {
-        throw Exception("Bad fen string (unexpected casting symbol): " + fen);
+        castlings_.their_queenside_rook = file;
+      } else if (!kingside && !theirs) {
+        castlings_.set_we_can_000();
+        castlings_.our_queenside_rook = file;
       }
     }
-    castlings_.SetRookPositions(our_left_rook, our_right_rook, their_left_rook,
-                                their_right_rook);
-  }
-
-  if (en_passant != "-") {
-    auto square = BoardSquare(en_passant);
-    if (square.row() != RANK_3 && square.row() != RANK_6)
-      throw Exception("Bad fen string: " + fen + " wrong en passant rank");
-    pawns_.set((square.row() == RANK_3) ? RANK_1 : RANK_8, square.col());
   }
+  if (skip_whitespace("after castling")) return;
 
-  if (who_to_move == "b" || who_to_move == "B") {
-    Mirror();
-  } else if (who_to_move != "w" && who_to_move != "W") {
-    throw Exception("Bad fen string (side to move): " + fen);
+  // Parse en passant square; it may be the final field of a truncated FEN.
+  if (fen[pos] == '-') {
+    ++pos;
+  } else {
+    if (pos + 2 > fen.size()) complain("en passant square expected");
+    const File file = File::Parse(fen[pos]);
+    const Rank rank = Rank::Parse(fen[pos + 1]);
+    if (!file.IsValid() || !rank.IsValid()) complain("bad en passant square");
+    if (rank != (flipped() ? kRank3 : kRank6)) complain("bad en passant rank");
+    if ((ours() | theirs()).get(Square(file, kRank6))) {
+      complain("en passant square occupied");
+    }
+    if (!(theirs() & pawns()).get(Square(file, kRank5))) {
+      complain("no pawn to capture en passant");
+    }
+    pawns_.set(Square(file, kRank8));
+    pos += 2;
+  }
-  if (rule50_ply) *rule50_ply = rule50_halfmoves;
-  if (moves) *moves = total_moves;
+  if (skip_whitespace("after en passant")) return;
+
+  // Parse rule 50 halfmoves.
+  auto parse_int = [&](int* into, std::string_view error_msg) {
+    const std::string_view num = fen.substr(pos, fen.find(' ', pos) - pos);
+    int tmp;
+    auto res = std::from_chars(num.data(), num.data() + num.size(), tmp);
+    if (res.ec != std::errc()) complain(error_msg);
+    if (into) *into = tmp;
+    pos += num.size();
+  };
+  parse_int(rule50_ply, "bad rule 50 halfmoves");
+  if (skip_whitespace("after rule-50 clock")) return;
+
+  // Parse total moves.
+  parse_int(moves, "bad total moves");
+  if (!skip_whitespace("after total moves")) complain("extra characters");
 }
 
 bool ChessBoard::HasMatingMaterial() const {
@@ -1147,49 +1142,117 @@ bool ChessBoard::HasMatingMaterial() const {
 }
 
 std::string ChessBoard::DebugString() const {
+  auto fen = BoardToFen(*this);
+  std::replace(fen.begin(), fen.end(), ' ', '_');
+  return "https://lc0.org/fen/" + fen;
+}
+
+Move ChessBoard::ParseMove(std::string_view move_str) const {
+  auto complain = [&move_str](std::string_view reason) {
+    throw Exception("Invalid move (" + std::string(reason) +
+                    "): " + std::string(move_str));
+  };
+  if (move_str.size() < 4 || move_str.size() > 5) complain("wrong move size");
+  File from_file = File::Parse(move_str[0]);
+  Rank from_rank = Rank::Parse(move_str[1]);
+  File to_file = File::Parse(move_str[2]);
+  Rank to_rank = Rank::Parse(move_str[3]);
+  if (!from_file.IsValid() || !from_rank.IsValid() || !to_file.IsValid() ||
+      !to_rank.IsValid()) {
+    complain("bad square");
+  }
+  if (flipped_) {
+    from_rank.Flip();
+    to_rank.Flip();
+  }
+  Square from(from_file, from_rank);
+  Square to(to_file, to_rank);
+  if (!our_pieces_.get(from)) complain("no piece to move");
+
+  // Pawns at back ranks are used to encode en-passant, that's why we need to
+  // check that a piece doesn't go from there.
+  if (pawns_.get(from) && (from_rank != kRank1 && from_rank != kRank8) &&
+      (to_rank == kRank1 || to_rank == kRank8)) {
+    // Promotion.
+    PieceType promotion =
+        move_str.size() > 4 ? PieceType::Parse(move_str[4]) : kKnight;
+    if (!promotion.CanPromoteInto()) complain("invalid promotion");
+    return Move::WhitePromotion(from, to, promotion);
+  }
+  if (from == our_king_ && our_pieces_.get(to)) {
+    // FRC-style castling.
+    return Move::WhiteCastling(from.file(), to.file());
+  }
+  if (from == our_king_ && from == kSquareE1 && to == kSquareG1) {
+    // Kingside castling.
+    return Move::WhiteCastling(from.file(), kFileH);
+  }
+  if (from == our_king_ && from == kSquareE1 && to == kSquareC1) {
+    // Queenside castling.
+    return Move::WhiteCastling(from.file(), kFileA);
+  }
+  if (from.file() != to.file() && pawns().get(from) && !their_pieces_.get(to)) {
+    // En passant.
+    return Move::WhiteEnPassant(from, to);
+  }
+  return Move::White(from, to);
+}
+
+namespace {
+char GetPieceAt(const lczero::ChessBoard& board, Square square) {
+  char c = '\0';
+  if (board.ours().get(square) || board.theirs().get(square)) {
+    if (board.pawns().get(square)) {
+      c = 'P';
+    } else if (board.kings().get(square)) {
+      c = 'K';
+    } else if (board.bishops().get(square)) {
+      c = 'B';
+    } else if (board.queens().get(square)) {
+      c = 'Q';
+    } else if (board.rooks().get(square)) {
+      c = 'R';
+    } else {
+      c = 'N';
+    }
+    if (board.theirs().get(square)) {
+      c = std::tolower(c);  // Capitals are for white.
+    }
+  }
+  return c;
+}
+
+}  // namespace
+
+std::string BoardToFen(const ChessBoard& in_board) {
+  ChessBoard board(in_board);
+  const bool black_to_move = board.flipped();
+  if (black_to_move) board.Mirror();
   std::string result;
-  for (int i = 7; i >= 0; --i) {
-    for (int j = 0; j < 8; ++j) {
-      if (!our_pieces_.get(i, j) && !their_pieces_.get(i, j)) {
-        if (i == 2 && pawns_.get(0, j))
-          result += '*';
-        else if (i == 5 && pawns_.get(7, j))
-          result += '*';
-        else
-          result += '.';
-        continue;
-      }
-      if (our_king_ == i * 8 + j) {
-        result += 'K';
-        continue;
-      }
-      if (their_king_ == i * 8 + j) {
-        result += 'k';
-        continue;
-      }
-      char c = '?';
-      if ((pawns_ & kPawnMask).get(i, j)) {
-        c = 'p';
-      } else if (bishops_.get(i, j)) {
-        if (rooks_.get(i, j))
-          c = 'q';
-        else
-          c = 'b';
-      } else if (rooks_.get(i, j)) {
-        c = 'r';
+  for (Rank rank = kRank8; rank.IsValid(); --rank) {
+    int empty = 0;
+    for (File file = kFileA; file <= kFileH; ++file) {
+      Square square(file, rank);
+      char piece = GetPieceAt(board, square);
+      if (piece) {
+        if (empty) result += std::to_string(empty);
+        empty = 0;
+        result += piece;
       } else {
-        c = 'n';
+        ++empty;
       }
-      if (our_pieces_.get(i, j)) c = std::toupper(c);
-      result += c;
     }
-    if (i == 0) {
-      result += " " + castlings_.DebugString();
-      result += flipped_ ? " (from black's eyes)" : " (from white's eyes)";
-      result += " Hash: " + std::to_string(Hash());
-    }
-    result += '\n';
+    if (empty) result += std::to_string(empty);
+    if (rank != kRank1) result += '/';
+  }
+  result += black_to_move ? " b" : " w";
+  result += " " + board.castlings().as_string();
+  std::string ep = "-";
+  if (!board.en_passant().empty()) {
+    const Square sq = *board.en_passant().begin();
+    ep = Square(sq.file(), black_to_move ? kRank3 : kRank6).ToString(false);
   }
+  result += " " + ep;
   return result;
 }
 
diff --git a/src/chess/board.h b/src/chess/board.h
index 40cbfc16b4..d455fcb69d 100644
--- a/src/chess/board.h
+++ b/src/chess/board.h
@@ -31,6 +31,7 @@
 #include <string>
 
 #include "chess/bitboard.h"
+#include "chess/types.h"
 #include "utils/hashcat.h"
 
 namespace lczero {
@@ -43,10 +44,10 @@ class KingAttackInfo {
  public:
   bool in_check() const { return attack_lines_.as_int(); }
   bool in_double_check() const { return double_check_; }
-  bool is_pinned(const BoardSquare square) const {
+  bool is_pinned(const Square square) const {
     return pinned_pieces_.get(square);
   }
-  bool is_on_attack_line(const BoardSquare square) const {
+  bool is_on_attack_line(const Square square) const {
     return attack_lines_.get(square);
   }
 
@@ -73,7 +74,7 @@ class ChessBoard {
   // If @rule50_ply and @moves are not nullptr, they are filled with number
   // of moves without capture and number of full moves since the beginning of
   // the game.
-  void SetFromFen(std::string fen, int* rule50_ply = nullptr,
+  void SetFromFen(std::string_view fen, int* rule50_ply = nullptr,
                   int* moves = nullptr);
   // Nullifies the whole structure.
   void Clear();
@@ -89,7 +90,7 @@ class ChessBoard {
   // counter should be removed.
   bool ApplyMove(Move move);
   // Checks if the square is under attack from "theirs" (black).
-  bool IsUnderAttack(BoardSquare square) const;
+  bool IsUnderAttack(Square square) const;
   // Generates the king attack info used for legal move detection.
   KingAttackInfo GenerateKingAttackInfo() const;
   // Checks if "our" (white) king is under check.
@@ -101,18 +102,20 @@ class ChessBoard {
   MoveList GenerateLegalMoves() const;
   // Check whether pseudolegal move is legal.
   bool IsLegalMove(Move move, const KingAttackInfo& king_attack_info) const;
-  // Returns whether two moves are actually the same move in the position.
-  bool IsSameMove(Move move1, Move move2) const;
-  // Returns the same move but with castling encoded in legacy way.
-  Move GetLegacyMove(Move move) const;
-  // Returns the same move but with castling encoded in modern way.
-  Move GetModernMove(Move move) const;
+
+  // Parses a move from move_str.
+  // The input string should be in the "normal" notation rather than from the
+  // player to move, i.e. "e7e5" for the black pawn move.
+  // Output is currently "from the player to move" perspective (i.e. from=E2,
+  // to=E4 for the same black move). This is temporary, plan is to change it
+  // soon.
+  Move ParseMove(std::string_view move_str) const;
 
   uint64_t Hash() const {
     return HashCat({our_pieces_.as_int(), their_pieces_.as_int(),
                     rooks_.as_int(), bishops_.as_int(), pawns_.as_int(),
-                    (static_cast<uint32_t>(our_king_.as_int()) << 24) |
-                        (static_cast<uint32_t>(their_king_.as_int()) << 16) |
+                    (static_cast<uint32_t>(our_king_.as_idx()) << 24) |
+                        (static_cast<uint32_t>(their_king_.as_idx()) << 16) |
                         (static_cast<uint32_t>(castlings_.as_int()) << 8) |
                         static_cast<uint32_t>(flipped_)});
   }
@@ -120,10 +123,10 @@ class ChessBoard {
   class Castlings {
    public:
     Castlings()
-        : our_queenside_rook_(FILE_A),
-          their_queenside_rook_(FILE_A),
-          our_kingside_rook_(FILE_H),
-          their_kingside_rook_(FILE_H),
+        : our_queenside_rook(kFileA),
+          their_queenside_rook(kFileA),
+          our_kingside_rook(kFileH),
+          their_kingside_rook(kFileH),
           data_(0) {}
 
     void set_we_can_00() { data_ |= 1; }
@@ -143,8 +146,8 @@ class ChessBoard {
     bool no_legal_castle() const { return data_ == 0; }
 
     void Mirror() {
-      std::swap(our_queenside_rook_, their_queenside_rook_);
-      std::swap(our_kingside_rook_, their_kingside_rook_);
+      std::swap(our_queenside_rook, their_queenside_rook);
+      std::swap(our_kingside_rook, their_kingside_rook);
       data_ = ((data_ & 0b11) << 2) + ((data_ & 0b1100) >> 2);
     }
 
@@ -154,17 +157,17 @@ class ChessBoard {
     std::string as_string() const {
       if (data_ == 0) return "-";
       std::string result;
-      if (our_queenside_rook() == FILE_A && our_kingside_rook() == FILE_H &&
-          their_queenside_rook() == FILE_A && their_kingside_rook() == FILE_H) {
+      if (our_queenside_rook == kFileA && our_kingside_rook == kFileH &&
+          their_queenside_rook == kFileA && their_kingside_rook == kFileH) {
         if (we_can_00()) result += 'K';
         if (we_can_000()) result += 'Q';
         if (they_can_00()) result += 'k';
         if (they_can_000()) result += 'q';
       } else {
-        if (we_can_00()) result += 'A' + our_kingside_rook();
-        if (we_can_000()) result += 'A' + our_queenside_rook();
-        if (they_can_00()) result += 'a' + their_kingside_rook();
-        if (they_can_000()) result += 'a' + their_queenside_rook();
+        if (we_can_00()) result += our_kingside_rook.ToString(true);
+        if (we_can_000()) result += our_queenside_rook.ToString(true);
+        if (they_can_00()) result += their_kingside_rook.ToString(false);
+        if (they_can_000()) result += their_queenside_rook.ToString(false);
       }
       return result;
     }
@@ -177,44 +180,25 @@ class ChessBoard {
       if (they_can_00()) result += 'k';
       if (they_can_000()) result += 'q';
       result += '[';
-      result += 'A' + our_queenside_rook();
-      result += 'A' + our_kingside_rook();
-      result += 'a' + their_queenside_rook();
-      result += 'a' + their_kingside_rook();
+      result += our_queenside_rook.ToString(true);
+      result += our_kingside_rook.ToString(true);
+      result += their_queenside_rook.ToString(false);
+      result += their_kingside_rook.ToString(false);
       result += ']';
       return result;
     }
 
     uint8_t as_int() const { return data_; }
+    bool operator==(const Castlings& other) const = default;
 
-    bool operator==(const Castlings& other) const {
-      assert(our_queenside_rook_ == other.our_queenside_rook_ &&
-             our_kingside_rook_ == other.our_kingside_rook_ &&
-             their_queenside_rook_ == other.their_queenside_rook_ &&
-             their_kingside_rook_ == other.their_kingside_rook_);
-      return data_ == other.data_;
-    }
-
-    uint8_t our_queenside_rook() const { return our_queenside_rook_; }
-    uint8_t our_kingside_rook() const { return our_kingside_rook_; }
-    uint8_t their_queenside_rook() const { return their_queenside_rook_; }
-    uint8_t their_kingside_rook() const { return their_kingside_rook_; }
-    void SetRookPositions(uint8_t our_left, uint8_t our_right,
-                          uint8_t their_left, uint8_t their_right) {
-      our_queenside_rook_ = our_left;
-      our_kingside_rook_ = our_right;
-      their_queenside_rook_ = their_left;
-      their_kingside_rook_ = their_right;
-    }
-
-   private:
     // Position of "left" (queenside) rook in starting game position.
-    uint8_t our_queenside_rook_;
-    uint8_t their_queenside_rook_;
+    File our_queenside_rook;
+    File their_queenside_rook;
     // Position of "right" (kingside) rook in starting position.
-    uint8_t our_kingside_rook_;
-    uint8_t their_kingside_rook_;
+    File our_kingside_rook;
+    File their_kingside_rook;
 
+   private:
     // - Bit 0 -- "our" side's kingside castle.
     // - Bit 1 -- "our" side's queenside castle.
     // - Bit 2 -- opponent's side's kingside castle.
@@ -236,48 +220,20 @@ class ChessBoard {
            rooks_ - bishops_;
   }
   BitBoard kings() const {
-    return our_king_.as_board() | their_king_.as_board();
+    return BitBoard::FromSquare(our_king_) | BitBoard::FromSquare(their_king_);
   }
   const Castlings& castlings() const { return castlings_; }
   bool flipped() const { return flipped_; }
 
-  bool operator==(const ChessBoard& other) const {
-    return (our_pieces_ == other.our_pieces_) &&
-           (their_pieces_ == other.their_pieces_) && (rooks_ == other.rooks_) &&
-           (bishops_ == other.bishops_) && (pawns_ == other.pawns_) &&
-           (our_king_ == other.our_king_) &&
-           (their_king_ == other.their_king_) &&
-           (castlings_ == other.castlings_) && (flipped_ == other.flipped_);
-  }
-
-  bool operator!=(const ChessBoard& other) const { return !operator==(other); }
-
-  enum Square : uint8_t {
-    // clang-format off
-    A1 = 0, B1, C1, D1, E1, F1, G1, H1,
-    A2, B2, C2, D2, E2, F2, G2, H2,
-    A3, B3, C3, D3, E3, F3, G3, H3,
-    A4, B4, C4, D4, E4, F4, G4, H4,
-    A5, B5, C5, D5, E5, F5, G5, H5,
-    A6, B6, C6, D6, E6, F6, G6, H6,
-    A7, B7, C7, D7, E7, F7, G7, H7,
-    A8, B8, C8, D8, E8, F8, G8, H8,
-    // clang-format on
-  };
-
-  enum File : uint8_t {
-    // clang-format off
-    FILE_A = 0, FILE_B, FILE_C, FILE_D, FILE_E, FILE_F, FILE_G, FILE_H
-    // clang-format on
-  };
-
-  enum Rank : uint8_t {
-    // clang-format off
-    RANK_1 = 0, RANK_2, RANK_3, RANK_4, RANK_5, RANK_6, RANK_7, RANK_8
-    // clang-format on
-  };
+  bool operator==(const ChessBoard& other) const = default;
+  bool operator!=(const ChessBoard& other) const = default;
 
  private:
+  // Sets the piece on the square.
+  void PutPiece(Square square, PieceType piece, bool is_theirs);
+  // Check internal state is consistent after state transformations.
+  bool IsValid() const;
+
   // All white pieces.
   BitBoard our_pieces_;
   // All black pieces.
@@ -292,10 +248,13 @@ class ChessBoard {
   // same for black pawns. Those "fake" pawns are not present in our_pieces_ and
   // their_pieces_ bitboards.
   BitBoard pawns_;
-  BoardSquare our_king_;
-  BoardSquare their_king_;
+  Square our_king_;
+  Square their_king_;
   Castlings castlings_;
   bool flipped_ = false;  // aka "Black to move".
 };
 
+// Converts the board to FEN string.
+std::string BoardToFen(const ChessBoard& board);
+
 }  // namespace lczero
diff --git a/src/chess/board_test.cc b/src/chess/board_test.cc
index 123574482d..eef6247d1d 100644
--- a/src/chess/board_test.cc
+++ b/src/chess/board_test.cc
@@ -23,42 +23,15 @@
 #include <iostream>
 
 #include "chess/bitboard.h"
-
 #include "utils/exception.h"
 
 namespace lczero {
 
-TEST(BoardSquare, BoardSquare) {
-  {
-    auto x = BoardSquare(ChessBoard::C2);
-    EXPECT_EQ(x.row(), 1);
-    EXPECT_EQ(x.col(), 2);
-  }
-
-  {
-    auto x = BoardSquare("c2");
-    EXPECT_EQ(x.row(), 1);
-    EXPECT_EQ(x.col(), 2);
-  }
-
-  {
-    auto x = BoardSquare(1, 2);
-    EXPECT_EQ(x.row(), 1);
-    EXPECT_EQ(x.col(), 2);
-  }
-
-  {
-    auto x = BoardSquare(1, 2);
-    x.Mirror();
-    EXPECT_EQ(x.row(), 6);
-    EXPECT_EQ(x.col(), 2);
-  }
-}
-
 TEST(ChessBoard, IllegalFirstRankPawns) {
   ChessBoard board;
-  EXPECT_THROW(board.SetFromFen("nqrbkrnr/bnnbnbnn/8/8/8/8/NNNBPNBN/QNRPKPQQ w - - 0 1");,
-               Exception);
+  EXPECT_THROW(
+      board.SetFromFen("nqrbkrnr/bnnbnbnn/8/8/8/8/NNNBPNBN/QNRPKPQQ w - - 0 1");
+      , Exception);
 }
 
 TEST(ChessBoard, PseudolegalMovesStartingPos) {
@@ -109,26 +82,26 @@ int Perft(const ChessBoard& board, int max_depth, bool dump = false,
     new_board.ApplyMove(move);
     if (new_board.IsUnderCheck()) {
       if (iter != legal_moves.end()) {
-        EXPECT_NE(iter->as_packed_int(), move.as_packed_int())
-            << board.DebugString() << "legal:[" << iter->as_string()
-            << "]==pseudo:(" << move.as_string() << ") Under check:\n"
+        EXPECT_NE(*iter, move)
+            << board.DebugString() << "legal:[" << iter->ToString(true)
+            << "]==pseudo:(" << move.ToString(true) << ") Under check:\n"
             << new_board.DebugString();
       }
       continue;
     }
 
-    EXPECT_EQ(iter->as_packed_int(), move.as_packed_int())
-        << board.DebugString() << "legal:[" << iter->as_string() << "]pseudo:("
-        << move.as_string() << ") after:\n"
-        << new_board.DebugString();
+    EXPECT_EQ(*iter, move) << board.DebugString() << "legal:["
+                           << iter->ToString(true) << "]pseudo:("
+                           << move.ToString(true) << ") after:\n"
+                           << new_board.DebugString();
 
     new_board.Mirror();
     ++iter;
     int count = Perft(new_board, max_depth, dump, depth + 1);
     if (dump && depth == 0) {
       Move m = move;
-      if (board.flipped()) m.Mirror();
-      std::cerr << m.as_string() << ": " << count << '\n';
+      if (board.flipped()) m.Flip();
+      std::cerr << m.ToString(true) << ": " << count << '\n';
     }
     total_count += count;
   }
@@ -2235,26 +2208,6 @@ TEST(ChessBoard, HasMatingMaterialMultipleBishopsNotSameColor) {
   EXPECT_TRUE(board.HasMatingMaterial());
 }
 
-TEST(ChessBoard, CastlingIsSameMove) {
-  ChessBoard board;
-  board.SetFromFen(
-      "r3k2r/ppp1bppp/2np1n2/4p1B1/4P1b1/2NP1N2/PPP1BPPP/R3K2R w KQkq - 0 1");
-  EXPECT_TRUE(board.IsSameMove("e1c1", "e1c1"));
-  EXPECT_TRUE(board.IsSameMove("e1a1", "e1a1"));
-  EXPECT_TRUE(board.IsSameMove("e1c1", "e1a1"));
-  EXPECT_FALSE(board.IsSameMove("e1c1", "e1b1"));
-  EXPECT_FALSE(board.IsSameMove("e1b1", "e1a1"));
-  EXPECT_FALSE(board.IsSameMove("e1c1", "e1g1"));
-  EXPECT_FALSE(board.IsSameMove("e1a1", "e1h1"));
-  EXPECT_FALSE(board.IsSameMove("e1c1", "e1h1"));
-  EXPECT_FALSE(board.IsSameMove("e1a1", "e1g1"));
-  EXPECT_FALSE(board.IsSameMove("e1f1", "e1g1"));
-  EXPECT_FALSE(board.IsSameMove("e1f1", "e1h1"));
-  EXPECT_TRUE(board.IsSameMove("e2c2", "e2c2"));
-  EXPECT_TRUE(board.IsSameMove("e2a2", "e2a2"));
-  EXPECT_FALSE(board.IsSameMove("e2c2", "e2a2"));
-}
-
 namespace {
 void TestInvalid(std::string fen) {
   ChessBoard board;
@@ -2267,7 +2220,6 @@ void TestInvalid(std::string fen) {
 }
 }  // namespace
 
-
 TEST(ChessBoard, InvalidFEN) {
   TestInvalid("rnbqkbnr/ppppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1");
   TestInvalid("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR/8 w KQkq - 0 1");
@@ -2280,10 +2232,24 @@ TEST(ChessBoard, InvalidFEN) {
 TEST(ChessBoard, InvalidEnPassantFromKnightPromotion) {
   ChessBoard board;
   board.SetFromFen("Q3b3/2P2pnk/3R3p/p7/1pp1p3/PnP1P2P/2B2PP1/5RK1 w - - 1 31");
-  board.ApplyMove(Move("c7c8"));
+  board.ApplyMove(board.ParseMove("c7c8"));
   EXPECT_TRUE(board.en_passant().empty());
 }
 
+// Move from an en-passant flag square was mistakenly marked as en-passant.
+TEST(ChessBoard, QueenMoveFromEnPassantFlagBug) {
+  ChessBoard board;
+  board.SetFromFen("1Qnkr3/1p1b4/p2P2p1/P1q5/1NP3pP/1KN5/8/3R4 b - - 0 32");
+  board.ApplyMove(board.ParseMove("b7b5"));
+  board.Mirror();
+  auto m = board.ParseMove("b8c7");
+  EXPECT_FALSE(m.is_en_passant());
+  board.ApplyMove(m);
+  board.Mirror();
+  MoveList legal_moves = {board.ParseMove("c5c7")};
+  EXPECT_EQ(board.GenerateLegalMoves(), legal_moves);
+}
+
 }  // namespace lczero
 
 int main(int argc, char** argv) {
diff --git a/src/chess/callbacks.h b/src/chess/callbacks.h
index 7e28d271b1..4205e2441a 100644
--- a/src/chess/callbacks.h
+++ b/src/chess/callbacks.h
@@ -35,6 +35,7 @@
 
 #include "chess/bitboard.h"
 #include "chess/position.h"
+#include "utils/exception.h"
 
 namespace lczero {
 
@@ -65,27 +66,29 @@ struct ThinkingInfo {
   int64_t nodes = -1;
   // Nodes per second.
   int nps = -1;
+  // Evaluations per second.
+  int eps = -1;
   // Hash fullness * 1000
   int hashfull = -1;
   // Moves to mate.
-  std::optional<int> mate;
+  std::optional<int> mate = std::nullopt;
   // Win in centipawns.
-  std::optional<int> score;
+  std::optional<int> score = std::nullopt;
   // Win/Draw/Lose probability * 1000.
   struct WDL {
     int w;
     int d;
     int l;
   };
-  std::optional<WDL> wdl;
+  std::optional<WDL> wdl = std::nullopt;
   // Number of successful TB probes (not the same as playouts ending in TB hit).
   int tb_hits = -1;
   // Best line found. Moves are from perspective of white player.
-  std::vector<Move> pv;
+  std::vector<Move> pv = {};
   // Multipv index.
   int multipv = -1;
   // Freeform comment.
-  std::string comment;
+  std::string comment = "";
 
   // Those are extensions and not really UCI protocol.
   // 1 if it's "player1", 2 if it's "player2"
@@ -93,9 +96,9 @@ struct ThinkingInfo {
   // Index of the game in the tournament (0-based).
   int game_id = -1;
   // The color of the player, if known.
-  std::optional<bool> is_black;
+  std::optional<bool> is_black = std::nullopt;
   // Moves left
-  std::optional<int> moves_left;
+  std::optional<int> moves_left = std::nullopt;
 };
 
 // Is sent when a single game is finished.
@@ -144,6 +147,34 @@ class UciResponder {
   virtual void OutputThinkingInfo(std::vector<ThinkingInfo>* infos) = 0;
 };
 
+// The responder which forwards the output to another responder, with
+// observer-like subscription model.
+class UciResponderForwarder : public UciResponder {
+ public:
+  void OutputBestMove(BestMoveInfo* info) override {
+    if (wrapped_) wrapped_->OutputBestMove(info);
+  }
+  void OutputThinkingInfo(std::vector<ThinkingInfo>* infos) override {
+    if (wrapped_) wrapped_->OutputThinkingInfo(infos);
+  }
+  void Register(UciResponder* wrapped) {
+    if (wrapped_) {
+      throw Exception("UciResponderForwarder already has a wrapped responder");
+    }
+    wrapped_ = wrapped;
+  }
+  void Unregister(UciResponder* wrapped) {
+    if (wrapped_ != wrapped) {
+      throw Exception(
+          "UciResponderForwarder doesn't have this wrapped responder");
+    }
+    wrapped_ = nullptr;
+  }
+
+ private:
+  UciResponder* wrapped_ = nullptr;
+};
+
 // The responder which calls callbacks. Used for easier transition from old
 // code.
 class CallbackUciResponder : public UciResponder {
@@ -202,49 +233,4 @@ class TransformingUciResponder : public UciResponder {
   std::unique_ptr<UciResponder> parent_;
 };
 
-class WDLResponseFilter : public TransformingUciResponder {
-  using TransformingUciResponder::TransformingUciResponder;
-  void TransformThinkingInfo(std::vector<ThinkingInfo>* infos) override {
-    for (auto& info : *infos) info.wdl.reset();
-  }
-};
-
-class MovesLeftResponseFilter : public TransformingUciResponder {
-  using TransformingUciResponder::TransformingUciResponder;
-  void TransformThinkingInfo(std::vector<ThinkingInfo>* infos) override {
-    for (auto& info : *infos) info.moves_left.reset();
-  }
-};
-
-// Remaps FRC castling to legacy castling.
-class Chess960Transformer : public TransformingUciResponder {
- public:
-  Chess960Transformer(std::unique_ptr<UciResponder> parent,
-                      ChessBoard head_board)
-      : TransformingUciResponder(std::move(parent)), head_board_(head_board) {}
-
- private:
-  void TransformBestMove(BestMoveInfo* best_move) override {
-    std::vector<Move> moves({best_move->bestmove, best_move->ponder});
-    ConvertToLegacyCastling(head_board_, &moves);
-    best_move->bestmove = moves[0];
-    best_move->ponder = moves[1];
-  }
-  void TransformThinkingInfo(std::vector<ThinkingInfo>* infos) override {
-    for (auto& x : *infos) ConvertToLegacyCastling(head_board_, &x.pv);
-  }
-  static void ConvertToLegacyCastling(ChessBoard pos,
-                                      std::vector<Move>* moves) {
-    for (auto& move : *moves) {
-      if (pos.flipped()) move.Mirror();
-      move = pos.GetLegacyMove(move);
-      pos.ApplyMove(move);
-      if (pos.flipped()) move.Mirror();
-      pos.Mirror();
-    }
-  }
-
-  const ChessBoard head_board_;
-};
-
 }  // namespace lczero
diff --git a/src/chess/gamestate.cc b/src/chess/gamestate.cc
new file mode 100644
index 0000000000..faaea02c7f
--- /dev/null
+++ b/src/chess/gamestate.cc
@@ -0,0 +1,52 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "chess/gamestate.h"
+
+#include <algorithm>
+#include <numeric>
+
+namespace lczero {
+
+Position GameState::CurrentPosition() const {
+  Position current = startpos;  // Replay every move on top of the start.
+  for (const Move m : moves) current = Position(current, m);
+  return current;
+}
+
+// Returns the starting position followed by the position after each move.
+std::vector<Position> GameState::GetPositions() const {
+  std::vector<Position> positions;
+  positions.reserve(moves.size() + 1);
+  positions.push_back(startpos);
+  // Each position is derived from the previous one, so moves must be applied
+  // strictly in order; reserve() above keeps positions.back() valid.
+  for (const Move m : moves) positions.emplace_back(positions.back(), m);
+  return positions;
+}
+
+}  // namespace lczero
diff --git a/src/chess/gamestate.h b/src/chess/gamestate.h
new file mode 100644
index 0000000000..e584b2d30b
--- /dev/null
+++ b/src/chess/gamestate.h
@@ -0,0 +1,48 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <vector>
+
+#include "chess/position.h"
+
+namespace lczero {
+
+// A structure that is passed to Search/SearchEnvironment to provide the game
+// state. Somewhat mirrors the UCI `position <fen> moves ...` command.
+struct GameState {
+  Position startpos;
+  std::vector<Move> moves;
+
+  // Returns the position after the last move (startpos if there are no moves).
+  Position CurrentPosition() const;
+  // Returns positions after all moves, including the starting and the last one.
+  std::vector<Position> GetPositions() const;
+};
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/chess/pgn.h b/src/chess/pgn.h
index 512d4c673b..dd50ab9c98 100644
--- a/src/chess/pgn.h
+++ b/src/chess/pgn.h
@@ -33,6 +33,7 @@
 #include <cctype>
 #include <cerrno>
 #include <fstream>
+#include <optional>
 
 #include "chess/bitboard.h"
 #include "chess/board.h"
@@ -77,7 +78,7 @@ class PgnReader {
     while (GzGetLine(file, line)) {
       // Check if we have a UTF-8 BOM. If so, just ignore it.
       // Only supposed to exist in the first line, but should not matter.
-      if (line.substr(0,3) == "\xEF\xBB\xBF") line = line.substr(3);
+      if (line.substr(0, 3) == "\xEF\xBB\xBF") line = line.substr(3);
       if (!line.empty() && line.back() == '\r') line.pop_back();
       // TODO: support line breaks in tags to ensure they are properly ignored.
       if (line.empty() || line[0] == '[') {
@@ -151,9 +152,7 @@ class PgnReader {
         // Board ApplyMove wants mirrored for black, but outside code wants
         // normal, so mirror it back again.
         // Check equal to 0 since we've already added the position.
-        if ((cur_game_.size() % 2) == 0) {
-          cur_game_.back().Mirror();
-        }
+        if ((cur_game_.size() % 2) == 0) cur_game_.back().Flip();
         cur_board_.Mirror();
       }
     }
@@ -173,18 +172,18 @@ class PgnReader {
     cur_startpos_ = ChessBoard::kStartposFen;
   }
 
-  Move::Promotion PieceToPromotion(int p) {
+  static std::optional<PieceType> PieceToPieceType(int p) {
     switch (p) {
       case -1:
-        return Move::Promotion::None;
+        return std::nullopt;
       case 2:
-        return Move::Promotion::Queen;
+        return kQueen;
       case 3:
-        return Move::Promotion::Bishop;
+        return kBishop;
       case 4:
-        return Move::Promotion::Knight;
+        return kKnight;
       case 5:
-        return Move::Promotion::Rook;
+        return kRook;
       default:
         // 0 and 1 are pawn and king, which are not legal promotions, other
         // numbers don't correspond to a known piece type.
@@ -206,18 +205,14 @@ class PgnReader {
       p = 4;
     } else if (san[0] == 'R') {
       p = 5;
-    } else if (san[0] == 'O' && san.size() > 2 && san[1] == '-' &&
-               san[2] == 'O') {
+    } else if (san.substr(0, 3) == "O-O") {
       Move m;
       auto king_board = board.kings() & board.ours();
-      BoardSquare king_sq(GetLowestBit(king_board.as_int()));
-      if (san.size() > 4 && san[3] == '-' && san[4] == 'O') {
-        m = Move(BoardSquare(0, king_sq.col()),
-                 BoardSquare(0, board.castlings().our_queenside_rook()));
-      } else {
-        m = Move(BoardSquare(0, king_sq.col()),
-                 BoardSquare(0, board.castlings().our_kingside_rook()));
-      }
+      Square king_sq(File::FromIdx(GetLowestBit(king_board.as_int())), kRank1);
+      m = Move::WhiteCastling(king_sq.file(),
+                              san.substr(3, 2) == "-O"
+                                  ? board.castlings().our_queenside_rook
+                                  : board.castlings().our_kingside_rook);
       return m;
     }
     if (p != 0) idx++;
@@ -285,20 +280,29 @@ class PgnReader {
       auto plm = board.GenerateLegalMoves();
       int pr1 = -1;
       int pc1 = -1;
-      for (BoardSquare sq : searchBits) {
-        if (sr1 != -1 && sq.row() != sr1) continue;
-        if (c1 != -1 && sq.col() != c1) continue;
-        if (std::find(plm.begin(), plm.end(),
-                      Move(sq, BoardSquare(sr2, c2), PieceToPromotion(p2))) ==
-            plm.end()) {
+      for (Square sq : searchBits) {
+        if (sr1 != -1 && sq.rank().idx != sr1) continue;
+        if (c1 != -1 && sq.file().idx != c1) continue;
+        std::optional<PieceType> promotion = PieceToPieceType(p2);
+        std::optional<Square> enpassant = std::nullopt;
+        if (!board.en_passant().empty()) {
+          auto sq = *board.en_passant().begin();
+          enpassant = Square(sq.file(), kRank6);
+        }
+        Square to(File::FromIdx(c2), Rank::FromIdx(sr2));
+        Move move_to_find = promotion ? Move::WhitePromotion(sq, to, *promotion)
+                            : enpassant && *enpassant == to
+                                ? Move::WhiteEnPassant(sq, to)
+                                : Move::White(sq, to);
+        if (std::find(plm.begin(), plm.end(), move_to_find) == plm.end()) {
           continue;
         }
         if (pc1 != -1) {
           CERR << "Ambiguous!!";
           throw Exception("Opening book move seems ambiguous.");
         }
-        pr1 = sq.row();
-        pc1 = sq.col();
+        pr1 = sq.rank().idx;
+        pc1 = sq.file().idx;
       }
       if (pc1 == -1) {
         CERR << "No Match!!";
@@ -310,8 +314,19 @@ class PgnReader {
         r1 = 7 - r1;
       }
     }
-    Move m(BoardSquare(r1, c1), BoardSquare(r2, c2), PieceToPromotion(p2));
-    if (board.flipped()) m.Mirror();
+    std::optional<PieceType> promotion = PieceToPieceType(p2);
+
+    std::optional<Square> enpassant = std::nullopt;
+    if (!board.en_passant().empty()) {
+      auto sq = *board.en_passant().begin();
+      enpassant = Square(sq.file(), board.flipped() ? kRank3 : kRank6);
+    }
+    Square from(File::FromIdx(c1), Rank::FromIdx(r1));
+    Square to(File::FromIdx(c2), Rank::FromIdx(r2));
+    Move m = promotion ? Move::WhitePromotion(from, to, *promotion)
+             : enpassant && *enpassant == to ? Move::WhiteEnPassant(from, to)
+                                             : Move::White(from, to);
+    if (board.flipped()) m.Flip();
     return m;
   }
 
diff --git a/src/chess/position.cc b/src/chess/position.cc
index ec085f1e37..fe6ed51678 100644
--- a/src/chess/position.cc
+++ b/src/chess/position.cc
@@ -27,45 +27,20 @@
 
 #include "chess/position.h"
 
+#include <algorithm>
 #include <cassert>
 #include <cctype>
 #include <cstdlib>
 #include <cstring>
 
-namespace {
-// GetPieceAt returns the piece found at row, col on board or the null-char '\0'
-// in case no piece there.
-char GetPieceAt(const lczero::ChessBoard& board, int row, int col) {
-  char c = '\0';
-  if (board.ours().get(row, col) || board.theirs().get(row, col)) {
-    if (board.pawns().get(row, col)) {
-      c = 'P';
-    } else if (board.kings().get(row, col)) {
-      c = 'K';
-    } else if (board.bishops().get(row, col)) {
-      c = 'B';
-    } else if (board.queens().get(row, col)) {
-      c = 'Q';
-    } else if (board.rooks().get(row, col)) {
-      c = 'R';
-    } else {
-      c = 'N';
-    }
-    if (board.theirs().get(row, col)) {
-      c = std::tolower(c);  // Capitals are for white.
-    }
-  }
-  return c;
-}
+#include "chess/types.h"
 
-}  // namespace
 namespace lczero {
 
 Position::Position(const Position& parent, Move m)
     : rule50_ply_(parent.rule50_ply_ + 1), ply_count_(parent.ply_count_ + 1) {
-  them_board_ = parent.us_board_;
-  const bool is_zeroing = them_board_.ApplyMove(m);
-  us_board_ = them_board_;
+  us_board_ = parent.us_board_;
+  const bool is_zeroing = us_board_.ApplyMove(m);
   us_board_.Mirror();
   if (is_zeroing) rule50_ply_ = 0;
 }
@@ -73,15 +48,23 @@ Position::Position(const Position& parent, Move m)
 Position::Position(const ChessBoard& board, int rule50_ply, int game_ply)
     : rule50_ply_(rule50_ply), repetitions_(0), ply_count_(game_ply) {
   us_board_ = board;
-  them_board_ = board;
-  them_board_.Mirror();
+}
+
+Position Position::FromFen(std::string_view fen) {
+  Position pos;
+  pos.us_board_.SetFromFen(std::string(fen), &pos.rule50_ply_, &pos.ply_count_);
+  return pos;
 }
 
 uint64_t Position::Hash() const {
   return HashCat({us_board_.Hash(), static_cast<unsigned long>(repetitions_)});
 }
 
-std::string Position::DebugString() const { return us_board_.DebugString(); }
+std::string Position::DebugString() const {
+  std::string fen = PositionToFen(*this);
+  std::replace(fen.begin(), fen.end(), ' ', '_');
+  return "https://lc0.org/fen/" + fen;
+}
 
 GameResult operator-(const GameResult& res) {
   return res == GameResult::BLACK_WON   ? GameResult::WHITE_WON
@@ -114,6 +97,11 @@ void PositionHistory::Reset(const ChessBoard& board, int rule50_ply,
   positions_.emplace_back(board, rule50_ply, game_ply);
 }
 
+void PositionHistory::Reset(const Position& pos) {
+  positions_.clear();
+  positions_.push_back(pos);
+}
+
 void PositionHistory::Append(Move m) {
   // TODO(mooskagh) That should be emplace_back(Last(), m), but MSVS STL
   //                has a bug in implementation of emplace_back, when
@@ -130,7 +118,7 @@ int PositionHistory::ComputeLastMoveRepetitions(int* cycle_length) const {
   // TODO(crem) implement hash/cache based solution.
   if (last.GetRule50Ply() < 4) return 0;
 
-  for (int idx = positions_.size() - 3; idx >= 0; idx -= 2) {
+  for (int idx = positions_.size() - 5; idx >= 0; idx -= 2) {
     const auto& pos = positions_[idx];
     if (pos.GetBoard() == last.GetBoard()) {
       *cycle_length = positions_.size() - 1 - idx;
@@ -160,34 +148,8 @@ uint64_t PositionHistory::HashLast(int positions) const {
   return HashCat(hash, Last().GetRule50Ply());
 }
 
-std::string GetFen(const Position& pos) {
-  std::string result;
-  const ChessBoard& board = pos.GetWhiteBoard();
-  for (int row = 7; row >= 0; --row) {
-    int emptycounter = 0;
-    for (int col = 0; col < 8; ++col) {
-      char piece = GetPieceAt(board, row, col);
-      if (emptycounter > 0 && piece) {
-        result += std::to_string(emptycounter);
-        emptycounter = 0;
-      }
-      if (piece) {
-        result += piece;
-      } else {
-        emptycounter++;
-      }
-    }
-    if (emptycounter > 0) result += std::to_string(emptycounter);
-    if (row > 0) result += "/";
-  }
-  std::string enpassant = "-";
-  if (!board.en_passant().empty()) {
-    auto sq = *board.en_passant().begin();
-    enpassant = BoardSquare(pos.IsBlackToMove() ? 2 : 5, sq.col()).as_string();
-  }
-  result += pos.IsBlackToMove() ? " b" : " w";
-  result += " " + board.castlings().as_string();
-  result += " " + enpassant;
+std::string PositionToFen(const Position& pos) {
+  std::string result = BoardToFen(pos.GetBoard());
   result += " " + std::to_string(pos.GetRule50Ply());
   result += " " + std::to_string(
                       (pos.GetGamePly() + (pos.IsBlackToMove() ? 1 : 2)) / 2);
diff --git a/src/chess/position.h b/src/chess/position.h
index 69241467c4..fc28990c6b 100644
--- a/src/chess/position.h
+++ b/src/chess/position.h
@@ -27,7 +27,9 @@
 
 #pragma once
 
+#include <span>
 #include <string>
+#include <string_view>
 
 #include "chess/board.h"
 
@@ -35,10 +37,13 @@ namespace lczero {
 
 class Position {
  public:
+  Position() = default;
   // From parent position and move.
   Position(const Position& parent, Move m);
   // From particular position.
   Position(const ChessBoard& board, int rule50_ply, int game_ply);
+  // From fen.
+  static Position FromFen(std::string_view fen);
 
   uint64_t Hash() const;
   bool IsBlackToMove() const { return us_board_.flipped(); }
@@ -64,33 +69,28 @@ class Position {
 
   // Gets board from the point of view of player to move.
   const ChessBoard& GetBoard() const { return us_board_; }
-  // Gets board from the point of view of opponent.
-  const ChessBoard& GetThemBoard() const { return them_board_; }
-  // Gets board from the point of view of the white player.
-  const ChessBoard& GetWhiteBoard() const {
-    return us_board_.flipped() ? them_board_ : us_board_;
-  };
+
+  bool operator==(const Position&) const = default;
+  bool operator!=(const Position&) const = default;
 
   std::string DebugString() const;
 
  private:
   // The board from the point of view of the player to move.
   ChessBoard us_board_;
-  // The board from the point of view of opponent.
-  ChessBoard them_board_;
 
   // How many half-moves without capture or pawn move was there.
   int rule50_ply_ = 0;
   // How many repetitions this position had before. For new positions it's 0.
-  int repetitions_;
+  int repetitions_ = 0;
   // How many half-moves since the position was repeated or 0.
-  int cycle_length_;
+  int cycle_length_ = 0;
   // number of half-moves since beginning of the game.
   int ply_count_ = 0;
 };
 
-// GetFen returns a FEN notation for the position.
+// PositionToFen returns a FEN notation for the position.
-std::string GetFen(const Position& pos);
+std::string PositionToFen(const Position& pos);
 
 // These are ordered so max() prefers the best result.
 enum class GameResult : uint8_t { UNDECIDED, BLACK_WON, DRAW, WHITE_WON };
@@ -101,9 +101,11 @@ class PositionHistory {
   PositionHistory() = default;
   PositionHistory(const PositionHistory& other) = default;
   PositionHistory(PositionHistory&& other) = default;
+  PositionHistory(std::span<const Position> positions)
+      : positions_(positions.begin(), positions.end()) {}
 
   PositionHistory& operator=(const PositionHistory& other) = default;
-  PositionHistory& operator=(PositionHistory&& other) = default;  
+  PositionHistory& operator=(PositionHistory&& other) = default;
 
   // Returns first position of the game (or fen from which it was initialized).
   const Position& Starting() const { return positions_.front(); }
@@ -128,6 +130,7 @@ class PositionHistory {
 
   // Resets the position to a given state.
   void Reset(const ChessBoard& board, int rule50_ply, int game_ply);
+  void Reset(const Position& pos);
 
   // Appends a position to history.
   void Append(Move m);
@@ -147,6 +150,8 @@ class PositionHistory {
   // Checks for any repetitions since the last time 50 move rule was reset.
   bool DidRepeatSinceLastZeroingMove() const;
 
+  std::span<const Position> GetPositions() const { return positions_; }
+
  private:
   int ComputeLastMoveRepetitions(int* cycle_length) const;
 
diff --git a/src/chess/position_test.cc b/src/chess/position_test.cc
index 69951bd048..f6dd453986 100644
--- a/src/chess/position_test.cc
+++ b/src/chess/position_test.cc
@@ -51,7 +51,7 @@ TEST(Position, SetFenGetFen) {
     history.Reset(board, no_capture_ply,
                   2 * game_move - (board.flipped() ? 1 : 2));
     Position pos = history.Last();
-    std::string target_fen = GetFen(pos);
+    std::string target_fen = PositionToFen(pos);
     EXPECT_EQ(source_fens[i], target_fen);
   }
 }
@@ -62,12 +62,12 @@ TEST(PositionHistory, ComputeLastMoveRepetitionsWithoutLegalEnPassant) {
   PositionHistory history;
   board.SetFromFen("3b4/rp1r1k2/8/1RP2p1p/p1KP4/P3P2P/5P2/1R2B3 b - - 2 30");
   history.Reset(board, 2, 30);
-  history.Append(Move("f7f8", true));
-  history.Append(Move("f2f4", false));
-  history.Append(Move("d7h7", true));
-  history.Append(Move("c4d3", false));
-  history.Append(Move("h7d7", true));
-  history.Append(Move("d3c4", false));
+  history.Append(history.Last().GetBoard().ParseMove("f7f8"));
+  history.Append(history.Last().GetBoard().ParseMove("f2f4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7h7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4d3"));
+  history.Append(history.Last().GetBoard().ParseMove("h7d7"));
+  history.Append(history.Last().GetBoard().ParseMove("d3c4"));
   int history_idx = history.GetLength() - 1;
   const Position& repeated_position = history.GetPositionAt(history_idx);
   EXPECT_EQ(repeated_position.GetRepetitions(), 1);
@@ -78,12 +78,12 @@ TEST(PositionHistory, ComputeLastMoveRepetitionsWithLegalEnPassant) {
   PositionHistory history;
   board.SetFromFen("3b4/rp1r1k2/8/1RP2p1p/p1KP2p1/P3P2P/5P2/1R2B3 b - - 2 30");
   history.Reset(board, 2, 30);
-  history.Append(Move("f7f8", true));
-  history.Append(Move("f2f4", false));
-  history.Append(Move("d7h7", true));
-  history.Append(Move("c4d3", false));
-  history.Append(Move("h7d7", true));
-  history.Append(Move("d3c4", false));
+  history.Append(history.Last().GetBoard().ParseMove("f7f8"));
+  history.Append(history.Last().GetBoard().ParseMove("f2f4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7h7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4d3"));
+  history.Append(history.Last().GetBoard().ParseMove("h7d7"));
+  history.Append(history.Last().GetBoard().ParseMove("d3c4"));
   int history_idx = history.GetLength() - 1;
   const Position& repeated_position = history.GetPositionAt(history_idx);
   EXPECT_EQ(repeated_position.GetRepetitions(), 0);
@@ -94,12 +94,12 @@ TEST(PositionHistory, DidRepeatSinceLastZeroingMoveCurent) {
   PositionHistory history;
   board.SetFromFen("3b4/rp1r1k2/8/1RP2p1p/p1KP4/P3P2P/5P2/1R2B3 b - - 2 30");
   history.Reset(board, 2, 30);
-  history.Append(Move("f7f8", true));
-  history.Append(Move("f2f4", false));
-  history.Append(Move("d7h7", true));
-  history.Append(Move("c4d3", false));
-  history.Append(Move("h7d7", true));
-  history.Append(Move("d3c4", false));
+  history.Append(history.Last().GetBoard().ParseMove("f7f8"));
+  history.Append(history.Last().GetBoard().ParseMove("f2f4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7h7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4d3"));
+  history.Append(history.Last().GetBoard().ParseMove("h7d7"));
+  history.Append(history.Last().GetBoard().ParseMove("d3c4"));
   EXPECT_TRUE(history.DidRepeatSinceLastZeroingMove());
 }
 
@@ -108,13 +108,13 @@ TEST(PositionHistory, DidRepeatSinceLastZeroingMoveBefore) {
   PositionHistory history;
   board.SetFromFen("3b4/rp1r1k2/8/1RP2p1p/p1KP4/P3P2P/5P2/1R2B3 b - - 2 30");
   history.Reset(board, 2, 30);
-  history.Append(Move("f7f8", true));
-  history.Append(Move("f2f4", false));
-  history.Append(Move("d7h7", true));
-  history.Append(Move("c4d3", false));
-  history.Append(Move("h7d7", true));
-  history.Append(Move("d3c4", false));
-  history.Append(Move("d7e7", true));
+  history.Append(history.Last().GetBoard().ParseMove("f7f8"));
+  history.Append(history.Last().GetBoard().ParseMove("f2f4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7h7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4d3"));
+  history.Append(history.Last().GetBoard().ParseMove("h7d7"));
+  history.Append(history.Last().GetBoard().ParseMove("d3c4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7e7"));
   EXPECT_TRUE(history.DidRepeatSinceLastZeroingMove());
 }
 
@@ -123,14 +123,14 @@ TEST(PositionHistory, DidRepeatSinceLastZeroingMoveOlder) {
   PositionHistory history;
   board.SetFromFen("3b4/rp1r1k2/8/1RP2p1p/p1KP4/P3P2P/5P2/1R2B3 b - - 2 30");
   history.Reset(board, 2, 30);
-  history.Append(Move("f7f8", true));
-  history.Append(Move("f2f4", false));
-  history.Append(Move("d7h7", true));
-  history.Append(Move("c4d3", false));
-  history.Append(Move("h7d7", true));
-  history.Append(Move("d3c4", false));
-  history.Append(Move("d7e7", true));
-  history.Append(Move("c4b4", false));
+  history.Append(history.Last().GetBoard().ParseMove("f7f8"));
+  history.Append(history.Last().GetBoard().ParseMove("f2f4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7h7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4d3"));
+  history.Append(history.Last().GetBoard().ParseMove("h7d7"));
+  history.Append(history.Last().GetBoard().ParseMove("d3c4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7e7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4b4"));
   EXPECT_TRUE(history.DidRepeatSinceLastZeroingMove());
 }
 
@@ -139,15 +139,15 @@ TEST(PositionHistory, DidRepeatSinceLastZeroingMoveBeforeZero) {
   PositionHistory history;
   board.SetFromFen("3b4/rp1r1k2/8/1RP2p1p/p1KP4/P3P2P/5P2/1R2B3 b - - 2 30");
   history.Reset(board, 2, 30);
-  history.Append(Move("f7f8", true));
-  history.Append(Move("f2f4", false));
-  history.Append(Move("d7h7", true));
-  history.Append(Move("c4d3", false));
-  history.Append(Move("h7d7", true));
-  history.Append(Move("d3c4", false));
-  history.Append(Move("d7e7", true));
-  history.Append(Move("c4b4", false));
-  history.Append(Move("h5h4", true));
+  history.Append(history.Last().GetBoard().ParseMove("f7f8"));
+  history.Append(history.Last().GetBoard().ParseMove("f2f4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7h7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4d3"));
+  history.Append(history.Last().GetBoard().ParseMove("h7d7"));
+  history.Append(history.Last().GetBoard().ParseMove("d3c4"));
+  history.Append(history.Last().GetBoard().ParseMove("d7e7"));
+  history.Append(history.Last().GetBoard().ParseMove("c4b4"));
+  history.Append(history.Last().GetBoard().ParseMove("h5h4"));
   EXPECT_FALSE(history.DidRepeatSinceLastZeroingMove());
 }
 
@@ -156,8 +156,8 @@ TEST(PositionHistory, DidRepeatSinceLastZeroingMoveNeverRepeated) {
   PositionHistory history;
   board.SetFromFen("3b4/rp1r1k2/8/1RP2p1p/p1KP4/P3P2P/5P2/1R2B3 b - - 2 30");
   history.Reset(board, 2, 30);
-  history.Append(Move("f7f8", true));
-  history.Append(Move("f2f4", false));
+  history.Append(history.Last().GetBoard().ParseMove("f7f8"));
+  history.Append(history.Last().GetBoard().ParseMove("f2f4"));
   EXPECT_FALSE(history.DidRepeatSinceLastZeroingMove());
 }
 
diff --git a/src/chess/types.h b/src/chess/types.h
new file mode 100644
index 0000000000..2fce81a352
--- /dev/null
+++ b/src/chess/types.h
@@ -0,0 +1,245 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <cctype>
+#include <compare>
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace lczero {
+
+struct PieceType {  // Chess piece kind, stored as a small index.
+  uint8_t idx;  // Indexes "nqrbpk": 0=n 1=q 2=r 3=b 4=p 5=k.
+  static constexpr PieceType FromIdx(uint8_t idx) { return PieceType{idx}; }
+  static PieceType Parse(char c);  // Case-insensitive; unknown -> idx 6.
+  std::string ToString(bool uppercase = false) const {
+    return std::string(1, "nqrbpk"[idx] + (uppercase ? 'A' - 'a' : 0));
+  }
+  bool CanPromoteInto() const { return idx < 4; }  // n, q, r or b.
+  bool IsValid() const { return idx < 6; }  // False for Parse() failures.
+  bool operator==(const PieceType& other) const = default;
+  bool operator!=(const PieceType& other) const = default;
+
+ private:
+  constexpr explicit PieceType(uint8_t idx) : idx(idx) {}  // Use FromIdx().
+};
+
+constexpr PieceType kKnight = PieceType::FromIdx(0),
+                    kQueen = PieceType::FromIdx(1),
+                    kRook = PieceType::FromIdx(2),
+                    kBishop = PieceType::FromIdx(3),
+                    kPawn = PieceType::FromIdx(4),
+                    kKing = PieceType::FromIdx(5);
+
+struct File {  // Board file (column): idx 0..7 maps to 'a'..'h'.
+  uint8_t idx;  // Parse() lowercases via `| 0x20` so it stays constexpr.
+  File() : idx(0x80) {}  // Not on board.
+  constexpr bool IsValid() const { return idx < 8; }
+  static constexpr File FromIdx(uint8_t idx) { return File{idx}; }
+  static constexpr File Parse(char c) { return File((c | 0x20) - 'a'); }
+  std::string ToString(bool uppercase = false) const {
+    return std::string(1, (uppercase ? 'A' : 'a') + idx);
+  }
+  void Flop() { idx ^= 0b111; }  // Mirrors left-right: a<->h, b<->g, ...
+  auto operator<=>(const File& other) const = default;
+  void operator++() { ++idx; }
+  void operator--() { --idx; }
+  void operator+=(int delta) { idx += delta; }
+  File operator+(int delta) const { return File(idx + delta); }
+  File operator-(int delta) const { return File(idx - delta); }
+
+ private:
+  constexpr explicit File(uint8_t idx) : idx(idx) {}  // Use FromIdx().
+};
+
+constexpr File kFileA = File::FromIdx(0), kFileB = File::FromIdx(1),
+               kFileC = File::FromIdx(2), kFileD = File::FromIdx(3),
+               kFileE = File::FromIdx(4), kFileF = File::FromIdx(5),
+               kFileG = File::FromIdx(6), kFileH = File::FromIdx(7);
+
+struct Rank {  // Board rank (row): idx 0..7 maps to '1'..'8'.
+  uint8_t idx;
+  constexpr bool IsValid() const { return idx < 8; }
+  static constexpr Rank FromIdx(uint8_t idx) { return Rank{idx}; }
+  static constexpr Rank Parse(char c) { return Rank(c - '1'); }  // '1' -> 0.
+  void Flip() { idx ^= 0b111; }  // Mirrors top-bottom: 1<->8, 2<->7, ...
+  std::string ToString() const { return std::string(1, '1' + idx); }
+  auto operator<=>(const Rank& other) const = default;
+  void operator--() { --idx; }
+  void operator++() { ++idx; }
+  void operator+=(int delta) { idx += delta; }
+  Rank operator+(int delta) const { return Rank(idx + delta); }
+  Rank operator-(int delta) const { return Rank(idx - delta); }
+
+ private:
+  constexpr explicit Rank(uint8_t idx) : idx(idx) {}  // Use FromIdx().
+};
+
+constexpr Rank kRank1 = Rank::FromIdx(0), kRank2 = Rank::FromIdx(1),
+               kRank3 = Rank::FromIdx(2), kRank4 = Rank::FromIdx(3),
+               kRank5 = Rank::FromIdx(4), kRank6 = Rank::FromIdx(5),
+               kRank7 = Rank::FromIdx(6), kRank8 = Rank::FromIdx(7);
+
+// Stores the coordinates of a single square.
+class Square {
+ public:
+  constexpr Square() = default;  // `Square s;` leaves idx_ uninitialized.
+  constexpr Square(File file, Rank rank) : idx_(rank.idx * 8 + file.idx) {}
+  static constexpr Square FromIdx(uint8_t idx) { return Square{idx}; }
+  static constexpr Square Parse(std::string_view);  // E.g. "e4".
+  constexpr File file() const { return File::FromIdx(idx_ % 8); }
+  constexpr Rank rank() const { return Rank::FromIdx(idx_ / 8); }
+  // Flips the ranks. 1 becomes 8, 2 becomes 7, etc. Files remain the same.
+  void Flip() { idx_ ^= 0b111000; }
+  std::string ToString(bool uppercase = false) const {
+    return file().ToString(uppercase) + rank().ToString();
+  }
+  constexpr bool operator==(const Square& other) const = default;
+  constexpr bool operator!=(const Square& other) const = default;
+  constexpr uint8_t as_idx() const { return idx_; }  // rank * 8 + file.
+
+ private:
+  explicit constexpr Square(uint8_t idx) : idx_(idx) {}
+
+  // 0 is a1, 1 is b1, 8 is a2, 63 is h8.
+  uint8_t idx_;
+};
+
+constexpr Square kSquareA1 = Square(kFileA, kRank1),
+                 kSquareC1 = Square(kFileC, kRank1),
+                 kSquareE1 = Square(kFileE, kRank1),
+                 kSquareG1 = Square(kFileG, kRank1),
+                 kSquareH1 = Square(kFileH, kRank1);
+
+class Move {  // A chess move packed into 16 bits (see layout below).
+ public:
+  Move() = default;  // Null move: is_null() is true.
+  static constexpr Move White(Square from, Square to) {
+    return Move((from.as_idx() << 6) | to.as_idx());
+  }
+  static constexpr Move WhitePromotion(Square from, Square to,
+                                       PieceType promotion_piece) {
+    return Move((from.as_idx() << 6) | to.as_idx() | kPromotion |
+                (promotion_piece.idx << 12));
+  }
+  static constexpr Move WhiteCastling(File king, File rook) {
+    return Move((king.idx << 6) | rook.idx | kCastling);  // Both on rank 1.
+  }
+  static constexpr Move WhiteEnPassant(Square from, Square to) {
+    return Move((from.as_idx() << 6) | to.as_idx() | kEnPassant);
+  }
+
+  bool operator==(const Move& other) const = default;
+  bool operator!=(const Move& other) const = default;
+
+  // Mirrors the ranks of the move.
+  void Flip() { data_ ^= kFlipMask; }
+  std::string ToString(bool is_chess960) const;
+
+  Square from() const { return Square::FromIdx((data_ & kFromMask) >> 6); }
+  Square to() const { return Square::FromIdx(data_ & kToMask); }
+  bool is_promotion() const { return data_ & kPromotion; }
+  PieceType promotion() const {  // Meaningful only if is_promotion().
+    return PieceType::FromIdx((data_ & kPieceMask) >> 12);
+  }
+  bool is_castling() const { return (data_ & kSpecialMask) == kCastling; }
+  bool is_en_passant() const { return (data_ & kSpecialMask) == kEnPassant; }
+  // TODO remove this once UciResponder starts using std::optional for ponder.
+  bool is_null() const { return data_ == 0; }
+
+  uint16_t raw_data() const { return data_; }
+
+ private:
+  explicit constexpr Move(uint16_t data) : data_(data) {}
+
+  // Move encoding using 16 bits:
+  // - bits  0-5:  "to" square (6 bits)
+  // - bits  6-11: "from" square (6 bits)
+  // - bits  12-13: if is_promotion:  promotion piece type
+  //                if !is_promotion: SpecialMove
+  // - bit   14:   is_promotion flag
+  // - bit   15:   reserved (potentially for side-to-move)
+  // Castling is always encoded as a "king takes rook" move.
+  uint16_t data_ = 0;
+
+  enum Masks : uint16_t {
+    // clang-format off
+    kToMask      = 0b0000000000111111,
+    kFromMask    = 0b0000111111000000,
+    kSpecialMask = 0b0111000000000000,
+    kCastling    = 0b0001000000000000,
+    kEnPassant   = 0b0010000000000000,
+    kPromotion   = 0b0100000000000000,
+    kPieceMask   = 0b0011000000000000,
+    // If/when we have side-to-move bit, also flip it here.
+    kFlipMask    = 0b0000111000111000,
+    // clang-format on
+  };
+};
+
+inline int operator-(File a, File b) { return static_cast<int>(a.idx) - b.idx; }
+inline int operator-(Rank a, Rank b) { return static_cast<int>(a.idx) - b.idx; }
+
+inline constexpr Square Square::Parse(std::string_view str) {
+  return Square(File::Parse(str[0]), Rank::Parse(str[1]));  // No length check.
+}
+
+inline PieceType PieceType::Parse(char c) {  // Accepts either letter case.
+  switch (std::tolower(static_cast<unsigned char>(c))) {  // No UB on c < 0.
+    case 'n':
+      return kKnight;
+    case 'q':
+      return kQueen;
+    case 'r':
+      return kRook;
+    case 'b':
+      return kBishop;
+    case 'p':
+      return kPawn;
+    case 'k':
+      return kKing;
+    default:
+      return PieceType{6};  // Unknown letter: IsValid() is false.
+  }
+}
+
+inline std::string Move::ToString(bool is_chess960) const {
+  if (is_castling() && !is_chess960) {  // King lands on the g- or c-file.
+    return from().ToString() + (to().file() > from().file() ? "g" : "c") +
+           to().rank().ToString();
+  }
+  return from().ToString() + to().ToString() +
+         (is_promotion() ? promotion().ToString(false) : "");
+}
+
+using MoveList = std::vector<Move>;
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/chess/uciloop.cc b/src/chess/uciloop.cc
index 4135895965..398a8bd7bd 100644
--- a/src/chess/uciloop.cc
+++ b/src/chess/uciloop.cc
@@ -43,24 +43,43 @@
 #include "version.h"
 
 namespace lczero {
-
 namespace {
+
+const OptionId kUciChess960{
+    {.long_flag = "chess960",
+     .uci_option = "UCI_Chess960",
+     .help_text = "Castling moves are encoded as \"king takes rook\".",
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId kShowWDL{{.long_flag = "show-wdl",
+                         .uci_option = "UCI_ShowWDL",
+                         .help_text = "Show win, draw and lose probability.",
+                         .visibility = OptionId::kAlwaysVisible}};
+const OptionId kShowEPS{
+    {.long_flag = "show-eps",
+     .uci_option = "UCI_ShowEPS",
+     .help_text = "Show neural network evaluations per second.",
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId kShowMovesleft{{.long_flag = "show-movesleft",
+                               .uci_option = "UCI_ShowMovesLeft",
+                               .help_text = "Show estimated moves left.",
+                               .visibility = OptionId::kAlwaysVisible}};
+
 const std::unordered_map<std::string, std::unordered_set<std::string>>
     kKnownCommands = {
         {{"uci"}, {}},
         {{"isready"}, {}},
-        {{"setoption"}, {"context", "name", "value"}},
+        {{"setoption"}, {"name", "value"}},
         {{"ucinewgame"}, {}},
         {{"position"}, {"fen", "startpos", "moves"}},
         {{"go"},
          {"infinite", "wtime", "btime", "winc", "binc", "movestogo", "depth",
           "mate", "nodes", "movetime", "searchmoves", "ponder"}},
-        {{"start"}, {}},
         {{"stop"}, {}},
         {{"ponderhit"}, {}},
         {{"quit"}, {}},
         {{"xyzzy"}, {}},
         {{"fen"}, {}},
+        {{"wait"}, {}}
 };
 
 std::pair<std::string, std::unordered_map<std::string, std::string>>
@@ -80,6 +99,26 @@ ParseCommand(const std::string& line) {
     throw Exception("Unknown command: " + line);
   }
 
+  // Special parsing for setoption to keep strings unmodified.
+  if (command->first == "setoption") {
+    iss >> token;
+    if (token != "name") {
+      throw Exception("setoption must be followed by name");
+    }
+    int name_pos = iss.eof() ? line.length() : static_cast<int>(iss.tellg());
+    std::optional<int> value_pos;
+    while (iss >> token) {
+      if (token == "value") {
+        value_pos = iss.eof() ? line.length() : static_cast<int>(iss.tellg());
+        params["value"] = Trim(line.substr(*value_pos));
+        break;
+      }
+    }
+    params["name"] = Trim(line.substr(
+        name_pos, value_pos ? *value_pos - name_pos - 5 : std::string::npos));
+    return {"setoption", params};
+  }
+
   std::string whitespace;
   while (iss >> token) {
     auto iter = command->second.find(token);
@@ -125,45 +164,47 @@ int GetNumeric(const std::unordered_map<std::string, std::string>& params,
 
 bool ContainsKey(const std::unordered_map<std::string, std::string>& params,
                  const std::string& key) {
-  return params.find(key) != params.end();
+  return params.contains(key);
 }
 }  // namespace
 
-void UciLoop::RunLoop() {
-  std::cout.setf(std::ios::unitbuf);
-  std::string line;
-  while (std::getline(std::cin, line)) {
-    LOGFILE << ">> " << line;
-    try {
-      auto command = ParseCommand(line);
-      // Ignore empty line.
-      if (command.first.empty()) continue;
-      if (!DispatchCommand(command.first, command.second)) break;
-    } catch (Exception& ex) {
-      SendResponse(std::string("error ") + ex.what());
-    }
-  }
+UciLoop::UciLoop(StringUciResponder* uci_responder, OptionsParser* options,
+                 EngineControllerBase* engine)
+    : uci_responder_(uci_responder), options_(options), engine_(engine) {
+  engine_->RegisterUciResponder(uci_responder_);
 }
 
+UciLoop::~UciLoop() { engine_->UnregisterUciResponder(uci_responder_); }
+
 bool UciLoop::DispatchCommand(
     const std::string& command,
     const std::unordered_map<std::string, std::string>& params) {
   if (command == "uci") {
-    CmdUci();
+    uci_responder_->SendId();
+    for (const auto& option : options_->ListOptionsUci()) {
+      uci_responder_->SendRawResponse(option);
+    }
+    uci_responder_->SendRawResponse("uciok");
   } else if (command == "isready") {
-    CmdIsReady();
+    engine_->EnsureReady();
+    uci_responder_->SendRawResponse("readyok");
   } else if (command == "setoption") {
-    CmdSetOption(GetOrEmpty(params, "name"), GetOrEmpty(params, "value"),
-                 GetOrEmpty(params, "context"));
+    if (GetOrEmpty(params, "name").empty()) {
+      throw Exception("setoption requires name");
+    } else {
+      options_->SetUciOption(GetOrEmpty(params, "name"),
+                             GetOrEmpty(params, "value"));
+    }
   } else if (command == "ucinewgame") {
-    CmdUciNewGame();
+    engine_->NewGame();
   } else if (command == "position") {
     if (ContainsKey(params, "fen") == ContainsKey(params, "startpos")) {
       throw Exception("Position requires either fen or startpos");
     }
     const std::vector<std::string> moves =
         StrSplitAtWhitespace(GetOrEmpty(params, "moves"));
-    CmdPosition(GetOrEmpty(params, "fen"), moves);
+    const std::string fen = GetOrEmpty(params, "fen");
+    engine_->SetPosition(fen.empty() ? ChessBoard::kStartposFen : fen, moves);
   } else if (command == "go") {
     GoParams go_params;
     if (ContainsKey(params, "infinite")) {
@@ -196,17 +237,15 @@ bool UciLoop::DispatchCommand(
     UCIGOOPTION(nodes);
     UCIGOOPTION(movetime);
 #undef UCIGOOPTION
-    CmdGo(go_params);
+    engine_->Go(go_params);
+  } else if (command == "wait") {
+    engine_->Wait();
   } else if (command == "stop") {
-    CmdStop();
+    engine_->Stop();
   } else if (command == "ponderhit") {
-    CmdPonderHit();
-  } else if (command == "start") {
-    CmdStart();
-  } else if (command == "fen") {
-    CmdFen();
+    engine_->PonderHit();
   } else if (command == "xyzzy") {
-    SendResponse("Nothing happens.");
+    uci_responder_->SendRawResponse("Nothing happens.");
   } else if (command == "quit") {
     return false;
   } else {
@@ -215,37 +254,49 @@ bool UciLoop::DispatchCommand(
   return true;
 }
 
-void UciLoop::SendResponse(const std::string& response) {
-  SendResponses({response});
+bool UciLoop::ProcessLine(const std::string& line) {
+  auto command = ParseCommand(line);
+  // Ignore empty line.
+  if (command.first.empty()) return true;
+  return DispatchCommand(command.first, command.second);
 }
 
-void UciLoop::SendResponses(const std::vector<std::string>& responses) {
-  static std::mutex output_mutex;
-  std::lock_guard<std::mutex> lock(output_mutex);
-  for (auto& response : responses) {
-    LOGFILE << "<< " << response;
-    std::cout << response << std::endl;
-  }
+void StringUciResponder::PopulateParams(OptionsParser* options) {
+  options->Add<BoolOption>(kUciChess960) = false;
+  options->Add<BoolOption>(kShowWDL) = false;
+  options->Add<BoolOption>(kShowEPS) = false;
+  options->Add<BoolOption>(kShowMovesleft) = false;
+  options_ = &options->GetOptionsDict();
+}
+
+bool StringUciResponder::IsChess960() const {
+  return options_ ? options_->Get<bool>(kUciChess960) : false;
+}
+
+void StringUciResponder::SendRawResponse(const std::string& response) {
+  SendRawResponses({response});
 }
 
-void UciLoop::SendId() {
-  SendResponse("id name Lc0 v" + GetVersionStr());
-  SendResponse("id author The LCZero Authors.");
+void StringUciResponder::SendId() {
+  SendRawResponse("id name Lc0 v" + GetVersionStr());
+  SendRawResponse("id author The LCZero Authors.");
 }
 
-void UciLoop::SendBestMove(const BestMoveInfo& move) {
-  std::string res = "bestmove " + move.bestmove.as_string();
-  if (move.ponder) res += " ponder " + move.ponder.as_string();
-  if (move.player != -1) res += " player " + std::to_string(move.player);
-  if (move.game_id != -1) res += " gameid " + std::to_string(move.game_id);
-  if (move.is_black)
-    res += " side " + std::string(*move.is_black ? "black" : "white");
-  SendResponse(res);
+void StringUciResponder::OutputBestMove(BestMoveInfo* info) {
+  const bool c960 = IsChess960();
+  std::string res = "bestmove " + info->bestmove.ToString(c960);
+  if (!info->ponder.is_null()) res += " ponder " + info->ponder.ToString(c960);
+  if (info->player != -1) res += " player " + std::to_string(info->player);
+  if (info->game_id != -1) res += " gameid " + std::to_string(info->game_id);
+  if (info->is_black)
+    res += " side " + std::string(*info->is_black ? "black" : "white");
+  SendRawResponse(res);
 }
 
-void UciLoop::SendInfo(const std::vector<ThinkingInfo>& infos) {
+void StringUciResponder::OutputThinkingInfo(std::vector<ThinkingInfo>* infos) {
   std::vector<std::string> reses;
-  for (const auto& info : infos) {
+  const bool c960 = IsChess960();
+  for (const auto& info : *infos) {
     std::string res = "info";
     if (info.player != -1) res += " player " + std::to_string(info.player);
     if (info.game_id != -1) res += " gameid " + std::to_string(info.game_id);
@@ -258,26 +309,39 @@ void UciLoop::SendInfo(const std::vector<ThinkingInfo>& infos) {
     if (info.nodes >= 0) res += " nodes " + std::to_string(info.nodes);
     if (info.mate) res += " score mate " + std::to_string(*info.mate);
     if (info.score) res += " score cp " + std::to_string(*info.score);
-    if (info.wdl) {
+    if (info.wdl && options_ && options_->Get<bool>(kShowWDL)) {
       res += " wdl " + std::to_string(info.wdl->w) + " " +
              std::to_string(info.wdl->d) + " " + std::to_string(info.wdl->l);
     }
-    if (info.moves_left) {
+    if (info.moves_left && options_ && options_->Get<bool>(kShowMovesleft)) {
       res += " movesleft " + std::to_string(*info.moves_left);
     }
     if (info.hashfull >= 0) res += " hashfull " + std::to_string(info.hashfull);
     if (info.nps >= 0) res += " nps " + std::to_string(info.nps);
+    if (info.eps >= 0 && options_ && options_->Get<bool>(kShowEPS)) {
+      res += " eps " + std::to_string(info.eps);
+    }
     if (info.tb_hits >= 0) res += " tbhits " + std::to_string(info.tb_hits);
     if (info.multipv >= 0) res += " multipv " + std::to_string(info.multipv);
 
     if (!info.pv.empty()) {
       res += " pv";
-      for (const auto& move : info.pv) res += " " + move.as_string();
+      for (const auto& move : info.pv) res += " " + move.ToString(c960);
     }
     if (!info.comment.empty()) res += " string " + info.comment;
     reses.push_back(std::move(res));
   }
-  SendResponses(reses);
+  SendRawResponses(reses);
+}
+
+void StdoutUciResponder::SendRawResponses(
+    const std::vector<std::string>& responses) {
+  static std::mutex output_mutex;
+  std::lock_guard<std::mutex> lock(output_mutex);
+  for (auto& response : responses) {
+    LOGFILE << "<< " << response;
+    std::cout << response << std::endl;
+  }
 }
 
 }  // namespace lczero
diff --git a/src/chess/uciloop.h b/src/chess/uciloop.h
index be83088f16..4b8b2fd751 100644
--- a/src/chess/uciloop.h
+++ b/src/chess/uciloop.h
@@ -35,62 +35,93 @@
 
 #include "chess/callbacks.h"
 #include "utils/exception.h"
+#include "utils/optionsparser.h"
 
 namespace lczero {
 
 struct GoParams {
-  std::optional<std::int64_t> wtime;
-  std::optional<std::int64_t> btime;
-  std::optional<std::int64_t> winc;
-  std::optional<std::int64_t> binc;
-  std::optional<int> movestogo;
-  std::optional<int> depth;
-  std::optional<int> mate;
-  std::optional<int> nodes;
-  std::optional<std::int64_t> movetime;
+  std::optional<std::int64_t> wtime = std::nullopt;
+  std::optional<std::int64_t> btime = std::nullopt;
+  std::optional<std::int64_t> winc = std::nullopt;
+  std::optional<std::int64_t> binc = std::nullopt;
+  std::optional<int> movestogo = std::nullopt;
+  std::optional<int> depth = std::nullopt;
+  std::optional<int> mate = std::nullopt;
+  std::optional<int> nodes = std::nullopt;
+  std::optional<std::int64_t> movetime = std::nullopt;
   bool infinite = false;
-  std::vector<std::string> searchmoves;
+  std::vector<std::string> searchmoves = {};
   bool ponder = false;
 };
 
-class UciLoop {
+class StringUciResponder : public UciResponder {
  public:
-  virtual ~UciLoop() {}
-  virtual void RunLoop();
+  void PopulateParams(OptionsParser* options);
 
-  // Sends response to host.
-  void SendResponse(const std::string& response);
-  // Sends responses to host ensuring they are received as a block.
-  virtual void SendResponses(const std::vector<std::string>& responses);
-  void SendBestMove(const BestMoveInfo& move);
-  void SendInfo(const std::vector<ThinkingInfo>& infos);
   void SendId();
+  void OutputBestMove(BestMoveInfo* info) override;
+  void OutputThinkingInfo(std::vector<ThinkingInfo>* infos) override;
 
-  // Command handlers.
-  virtual void CmdUci() { throw Exception("Not supported"); }
-  virtual void CmdIsReady() { throw Exception("Not supported"); }
-  virtual void CmdSetOption(const std::string& /*name*/,
-                            const std::string& /*value*/,
-                            const std::string& /*context*/) {
-    throw Exception("Not supported");
-  }
-  virtual void CmdUciNewGame() { throw Exception("Not supported"); }
-  virtual void CmdPosition(const std::string& /*position*/,
-                           const std::vector<std::string>& /*moves*/) {
-    throw Exception("Not supported");
-  }
-  virtual void CmdFen() { throw Exception("Not supported"); }
-  virtual void CmdGo(const GoParams& /*params*/) {
-    throw Exception("Not supported");
-  }
-  virtual void CmdStop() { throw Exception("Not supported"); }
-  virtual void CmdPonderHit() { throw Exception("Not supported"); }
-  virtual void CmdStart() { throw Exception("Not supported"); }
+  // Sends response to host.
+  void SendRawResponse(const std::string& response);
+  // Sends responses to host ensuring they are received as a block.
+  virtual void SendRawResponses(const std::vector<std::string>& responses) = 0;
 
  private:
+  bool IsChess960() const;
+
+  const OptionsDict* options_ = nullptr;  // absl_nullable
+};
+
+class EngineControllerBase {
+ public:
+  virtual ~EngineControllerBase() = default;
+
+  // Blocks.
+  virtual void EnsureReady() = 0;
+
+  // Must not block.
+  virtual void NewGame() = 0;
+
+  // Blocks.
+  virtual void SetPosition(const std::string& fen,
+                           const std::vector<std::string>& moves) = 0;
+
+  // Must not block.
+  virtual void Go(const GoParams& params) = 0;
+  virtual void PonderHit() = 0;
+  // Can block.
+  virtual void Wait() = 0;
+  // Must not block.
+  virtual void Stop() = 0;
+
+  // Register and unregister the UCI responder using observer pattern.
+  virtual void RegisterUciResponder(UciResponder*) = 0;
+  virtual void UnregisterUciResponder(UciResponder*) = 0;
+};
+
+class UciLoop {
+ public:
+  UciLoop(StringUciResponder* uci_responder, OptionsParser* options,
+          EngineControllerBase* engine);
+  virtual ~UciLoop();
+
+  // Returns false if the loop should stop.
+  bool ProcessLine(const std::string& line);
+
+ protected:
   bool DispatchCommand(
       const std::string& command,
       const std::unordered_map<std::string, std::string>& params);
+
+  StringUciResponder* uci_responder_;  // absl_nonnull
+  OptionsParser* options_;             // absl_nonnull
+  EngineControllerBase* engine_;       // absl_nonnull
+};
+
+class StdoutUciResponder : public StringUciResponder {
+ public:
+  void SendRawResponses(const std::vector<std::string>& responses) override;
 };
 
 }  // namespace lczero
diff --git a/src/engine.cc b/src/engine.cc
index e4157657c6..c4c487c020 100644
--- a/src/engine.cc
+++ b/src/engine.cc
@@ -1,6 +1,6 @@
 /*
   This file is part of Leela Chess Zero.
-  Copyright (C) 2018-2019 The LCZero Authors
+  Copyright (C) 2024 The LCZero Authors
 
   Leela Chess is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -28,447 +28,250 @@
 #include "engine.h"
 
 #include <algorithm>
-#include <cmath>
-#include <functional>
 
-#include "mcts/search.h"
-#include "mcts/stoppers/factory.h"
-#include "utils/commandline.h"
-#include "utils/configfile.h"
-#include "utils/logging.h"
+#include "chess/position.h"
+#include "neural/backend.h"
+#include "neural/memcache.h"
+#include "neural/register.h"
+#include "neural/shared_params.h"
+#include "syzygy/syzygy.h"
 
 namespace lczero {
 namespace {
-const OptionId kThreadsOptionId{
-    "threads", "Threads",
-    "Number of (CPU) worker threads to use, 0 for the backend default.", 't'};
-const OptionId kLogFileId{"logfile", "LogFile",
-                          "Write log to that file. Special value <stderr> to "
-                          "output the log to the console.",
-                          'l'};
 const OptionId kSyzygyTablebaseId{
-    "syzygy-paths", "SyzygyPath",
-    "List of Syzygy tablebase directories, list entries separated by system "
-    "separator (\";\" for Windows, \":\" for Linux).",
-    's'};
-const OptionId kPonderId{"", "Ponder",
-                         "This option is ignored. Here to please chess GUIs."};
-const OptionId kUciChess960{
-    "chess960", "UCI_Chess960",
-    "Castling moves are encoded as \"king takes rook\"."};
-const OptionId kShowWDL{"show-wdl", "UCI_ShowWDL",
-                        "Show win, draw and lose probability."};
-const OptionId kShowMovesleft{"show-movesleft", "UCI_ShowMovesLeft",
-                              "Show estimated moves left."};
-const OptionId kStrictUciTiming{"strict-uci-timing", "StrictTiming",
-                                "The UCI host compensates for lag, waits for "
-                                "the 'readyok' reply before sending 'go' and "
-                                "only then starts timing."};
+    {.long_flag = "syzygy-paths",
+     .uci_option = "SyzygyPath",
+     .help_text =
+         "List of Syzygy tablebase directories, list entries separated by "
+         "system separator (\";\" for Windows, \":\" for Linux).",
+     .short_flag = 's',
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId kStrictUciTiming{
+    {.long_flag = "strict-uci-timing",
+     .uci_option = "StrictTiming",
+     .help_text = "The UCI host compensates for lag, waits for the 'readyok' "
+                  "reply before sending 'go' and only then starts timing.",
+     .visibility = OptionId::kProOnly}};
+const OptionId kPonderId{
+    {.long_flag = "",
+     .uci_option = "Ponder",
+     .help_text =
+         "Indicates to the engine that it will be requested to ponder. This "
+         "postpones resetting the search tree until the search is started.",
+     .visibility = OptionId::kAlwaysVisible}};
+
 const OptionId kPreload{"preload", "",
                         "Initialize backend and load net on engine startup."};
-const OptionId kValueOnly{
-    "value-only", "ValueOnly",
-    "In value only mode all search parameters are ignored and the position is "
-    "evaluated by getting the valuation of every child position and choosing "
-    "the worst for the opponent."};
-const OptionId kClearTree{"", "ClearTree",
-                          "Clear the tree before the next search."};
-
-MoveList StringsToMovelist(const std::vector<std::string>& moves,
-                           const ChessBoard& board) {
-  MoveList result;
-  if (moves.size()) {
-    result.reserve(moves.size());
-    const auto legal_moves = board.GenerateLegalMoves();
-    const auto end = legal_moves.end();
-    for (const auto& move : moves) {
-      const auto m = board.GetModernMove({move, board.flipped()});
-      if (std::find(legal_moves.begin(), end, m) != end) result.emplace_back(m);
-    }
-    if (result.empty()) throw Exception("No legal searchmoves.");
-  }
-  return result;
-}
-
 }  // namespace
 
-EngineController::EngineController(std::unique_ptr<UciResponder> uci_responder,
-                                   const OptionsDict& options)
-    : options_(options),
-      uci_responder_(std::move(uci_responder)),
-      current_position_{ChessBoard::kStartposFen, {}} {}
-
-void EngineController::PopulateOptions(OptionsParser* options) {
-  using namespace std::placeholders;
-  const bool is_simple =
-      CommandLine::BinaryName().find("simple") != std::string::npos;
-  NetworkFactory::PopulateOptions(options);
-  options->Add<IntOption>(kThreadsOptionId, 0, 128) = 0;
-  options->Add<IntOption>(kNNCacheSizeId, 0, 999999999) = 2000000;
-  SearchParams::Populate(options);
-
-  ConfigFile::PopulateOptions(options);
-  if (is_simple) {
-    options->HideAllOptions();
-    options->UnhideOption(kThreadsOptionId);
-    options->UnhideOption(NetworkFactory::kWeightsId);
-    options->UnhideOption(SearchParams::kContemptId);
-    options->UnhideOption(SearchParams::kMultiPvId);
-  }
+void Engine::PopulateOptions(OptionsParser* options) {
+  options->Add<BoolOption>(kPonderId) = false;
   options->Add<StringOption>(kSyzygyTablebaseId);
-  // Add "Ponder" option to signal to GUIs that we support pondering.
-  // This option is currently not used by lc0 in any way.
-  options->Add<BoolOption>(kPonderId) = true;
-  options->Add<BoolOption>(kUciChess960) = false;
-  options->Add<BoolOption>(kShowWDL) = false;
-  options->Add<BoolOption>(kShowMovesleft) = false;
-
-  PopulateTimeManagementOptions(is_simple ? RunType::kSimpleUci : RunType::kUci,
-                                options);
-
   options->Add<BoolOption>(kStrictUciTiming) = false;
-  options->HideOption(kStrictUciTiming);
-
   options->Add<BoolOption>(kPreload) = false;
-  options->Add<BoolOption>(kValueOnly) = false;
-  options->Add<ButtonOption>(kClearTree);
-  options->HideOption(kClearTree);
 }
 
-void EngineController::ResetMoveTimer() {
-  move_start_time_ = std::chrono::steady_clock::now();
+namespace {
+GameState MakeGameState(const std::string& fen,
+                        const std::vector<std::string>& moves) {
+  GameState state;
+  state.startpos = Position::FromFen(fen);
+  ChessBoard cur_board = state.startpos.GetBoard();
+  state.moves.reserve(moves.size());
+  for (const auto& move : moves) {
+    Move m = cur_board.ParseMove(move);
+    state.moves.push_back(m);
+    cur_board.ApplyMove(m);
+    cur_board.Mirror();
+  }
+  return state;
 }
+}  // namespace
 
-// Updates values from Uci options.
-void EngineController::UpdateFromUciOptions() {
-  SharedLock lock(busy_mutex_);
+class Engine::UciPonderForwarder : public UciResponder {
+ public:
+  UciPonderForwarder(Engine* engine) : engine_(engine) {}
 
-  // Syzygy tablebases.
-  std::string tb_paths = options_.Get<std::string>(kSyzygyTablebaseId);
-  if (!tb_paths.empty() && tb_paths != tb_paths_) {
-    syzygy_tb_ = std::make_unique<SyzygyTablebase>();
-    CERR << "Loading Syzygy tablebases from " << tb_paths;
-    if (!syzygy_tb_->init(tb_paths)) {
-      CERR << "Failed to load Syzygy tablebases!";
-      syzygy_tb_ = nullptr;
-    }
-    tb_paths_ = tb_paths;
-  } else if (tb_paths.empty()) {
-    syzygy_tb_ = nullptr;
-    tb_paths_.clear();
+  void OutputBestMove(BestMoveInfo* info) override {
+    if (!wrapped_) return;
+    wrapped_->OutputBestMove(info);
   }
-
-  // Network.
-  const auto network_configuration =
-      NetworkFactory::BackendConfiguration(options_);
-  if (network_configuration_ != network_configuration) {
-    network_ = NetworkFactory::LoadNetwork(options_);
-    network_configuration_ = network_configuration;
+  void OutputThinkingInfo(std::vector<ThinkingInfo>* infos) override {
+    if (!wrapped_) return;
+    if (engine_->last_go_params_ && engine_->last_go_params_->ponder) {
+      assert(engine_->last_position_ &&
+             !engine_->last_position_->moves.empty());
+      const Move ponder_move_ = engine_->last_position_->moves.back();
+      // Output all stats from main variation (not necessarily the ponder move)
+      // but PV only from ponder move.
+      ThinkingInfo ponder_info;
+      for (const auto& info : *infos) {
+        if (info.multipv <= 1) {
+          ponder_info = info;
+          if (ponder_info.mate) ponder_info.mate = -*ponder_info.mate;
+          if (ponder_info.score) ponder_info.score = -*ponder_info.score;
+          if (ponder_info.depth > 1) ponder_info.depth--;
+          if (ponder_info.seldepth > 1) ponder_info.seldepth--;
+          if (ponder_info.wdl)
+            std::swap(ponder_info.wdl->w, ponder_info.wdl->l);
+          ponder_info.pv.clear();
+        }
+        if (!info.pv.empty() && info.pv[0] == ponder_move_) {
+          ponder_info.pv.assign(info.pv.begin() + 1, info.pv.end());
+        }
+      }
+      infos->clear();
+      infos->push_back(ponder_info);
+    }
+    wrapped_->OutputThinkingInfo(infos);
   }
 
-  // Cache size.
-  cache_.SetCapacity(options_.Get<int>(kNNCacheSizeId));
-
-  // Check whether we can update the move timer in "Go".
-  strict_uci_timing_ = options_.Get<bool>(kStrictUciTiming);
-}
-
-void EngineController::EnsureReady() {
-  std::unique_lock<RpSharedMutex> lock(busy_mutex_);
-  // If a UCI host is waiting for our ready response, we can consider the move
-  // not started until we're done ensuring ready.
-  ResetMoveTimer();
-}
-
-void EngineController::NewGame() {
-  // In case anything relies upon defaulting to default position and just calls
-  // newgame and goes straight into go.
-  ResetMoveTimer();
-  SharedLock lock(busy_mutex_);
-  cache_.Clear();
-  search_.reset();
-  tree_.reset();
-  CreateFreshTimeManager();
-  current_position_ = {ChessBoard::kStartposFen, {}};
-  UpdateFromUciOptions();
-}
-
-void EngineController::SetPosition(const std::string& fen,
-                                   const std::vector<std::string>& moves_str) {
-  // Some UCI hosts just call position then immediately call go, while starting
-  // the clock on calling 'position'.
-  ResetMoveTimer();
-  SharedLock lock(busy_mutex_);
-  current_position_ = CurrentPosition{fen, moves_str};
-  search_.reset();
-}
-
-Position EngineController::ApplyPositionMoves() {
-  ChessBoard board;
-  int no_capture_ply;
-  int game_move;
-  board.SetFromFen(current_position_.fen, &no_capture_ply, &game_move);
-  int game_ply = 2 * game_move - (board.flipped() ? 1 : 2);
-  Position pos(board, no_capture_ply, game_ply);
-  for (std::string move_str : current_position_.moves) {
-    Move move(move_str);
-    if (pos.IsBlackToMove()) move.Mirror();
-    pos = Position(pos, move);
+  void Register(UciResponder* wrapped) {
+    if (wrapped_) {
+      throw Exception("UciPonderForwarder already has a wrapped responder");
+    }
+    wrapped_ = wrapped;
   }
-  return pos;
-}
-
-void EngineController::SetupPosition(
-    const std::string& fen, const std::vector<std::string>& moves_str) {
-  SharedLock lock(busy_mutex_);
-  search_.reset();
-
-  UpdateFromUciOptions();
-
-  if (!tree_) tree_ = std::make_unique<NodeTree>();
-
-  std::vector<Move> moves;
-  for (const auto& move : moves_str) moves.emplace_back(move);
-  const bool is_same_game = tree_->ResetToPosition(fen, moves);
-  if (!is_same_game) CreateFreshTimeManager();
-}
-
-void EngineController::CreateFreshTimeManager() {
-  time_manager_ = MakeTimeManager(options_);
-}
-
-namespace {
-
-class PonderResponseTransformer : public TransformingUciResponder {
- public:
-  PonderResponseTransformer(std::unique_ptr<UciResponder> parent,
-                            std::string ponder_move)
-      : TransformingUciResponder(std::move(parent)),
-        ponder_move_(std::move(ponder_move)) {}
-
-  void TransformThinkingInfo(std::vector<ThinkingInfo>* infos) override {
-    // Output all stats from main variation (not necessary the ponder move)
-    // but PV only from ponder move.
-    ThinkingInfo ponder_info;
-    for (const auto& info : *infos) {
-      if (info.multipv <= 1) {
-        ponder_info = info;
-        if (ponder_info.mate) ponder_info.mate = -*ponder_info.mate;
-        if (ponder_info.score) ponder_info.score = -*ponder_info.score;
-        if (ponder_info.depth > 1) ponder_info.depth--;
-        if (ponder_info.seldepth > 1) ponder_info.seldepth--;
-        if (ponder_info.wdl) std::swap(ponder_info.wdl->w, ponder_info.wdl->l);
-        ponder_info.pv.clear();
-      }
-      if (!info.pv.empty() && info.pv[0].as_string() == ponder_move_) {
-        ponder_info.pv.assign(info.pv.begin() + 1, info.pv.end());
-      }
+  void Unregister(UciResponder* wrapped) {
+    if (wrapped_ != wrapped) {
+      throw Exception("UciPonderForwarder doesn't have this wrapped responder");
     }
-    infos->clear();
-    infos->push_back(ponder_info);
+    wrapped_ = nullptr;
   }
 
  private:
-  std::string ponder_move_;
+  UciResponder* wrapped_ = nullptr;
+  Engine* const engine_;
 };
 
-void ValueOnlyGo(NodeTree* tree, Network* network, const OptionsDict& options,
-                 std::unique_ptr<UciResponder> responder) {
-  auto input_format = network->GetCapabilities().input_format;
-
-  const auto& board = tree->GetPositionHistory().Last().GetBoard();
-  auto legal_moves = board.GenerateLegalMoves();
-  tree->GetCurrentHead()->CreateEdges(legal_moves);
-  PositionHistory history = tree->GetPositionHistory();
-  std::vector<InputPlanes> planes;
-  for (auto edge : tree->GetCurrentHead()->Edges()) {
-    history.Append(edge.GetMove());
-    if (history.ComputeGameResult() == GameResult::UNDECIDED) {
-      planes.emplace_back(EncodePositionForNN(
-          input_format, history, 8, FillEmptyHistory::FEN_ONLY, nullptr));
-    }
-    history.Pop();
+Engine::Engine(const SearchFactory& factory, const OptionsDict& opts)
+    : uci_forwarder_(std::make_unique<UciPonderForwarder>(this)),
+      options_(opts),
+      search_(factory.CreateSearch(uci_forwarder_.get(), &options_)) {
+  if (options_.Get<bool>(kPreload)) {
+    UpdateBackendConfig();
+    EnsureSyzygyTablebasesLoaded();
   }
+}
 
-  std::vector<float> comp_q;
-  int batch_size = options.Get<int>(SearchParams::kMiniBatchSizeId);
-  if (batch_size == 0) batch_size = network->GetMiniBatchSize();
-
-  for (size_t i = 0; i < planes.size(); i += batch_size) {
-    auto comp = network->NewComputation();
-    for (int j = 0; j < batch_size; j++) {
-      comp->AddInput(std::move(planes[i + j]));
-      if (i + j + 1 == planes.size()) break;
-    }
-    comp->ComputeBlocking();
+Engine::~Engine() { EnsureSearchStopped(); }
 
-    for (int j = 0; j < batch_size; j++) comp_q.push_back(comp->GetQVal(j));
-  }
+void Engine::EnsureSearchStopped() {
+  search_->AbortSearch();
+  search_->WaitSearch();
+}
 
-  Move best;
-  int comp_idx = 0;
-  float max_q = std::numeric_limits<float>::lowest();
-  for (auto edge : tree->GetCurrentHead()->Edges()) {
-    history.Append(edge.GetMove());
-    auto result = history.ComputeGameResult();
-    float q = -1;
-    if (result == GameResult::UNDECIDED) {
-      // NN eval is for side to move perspective - so if its good, its bad for
-      // us.
-      q = -comp_q[comp_idx];
-      comp_idx++;
-    } else if (result == GameResult::DRAW) {
-      q = 0;
-    } else {
-      // A legal move to a non-drawn terminal without tablebases must be a
-      // win.
-      q = 1;
-    }
-    if (q >= max_q) {
-      max_q = q;
-      best = edge.GetMove(tree->GetPositionHistory().IsBlackToMove());
-    }
-    history.Pop();
+void Engine::UpdateBackendConfig() {
+  LOGFILE << "Update backend configuration.";
+  const std::string backend_name =
+      options_.Get<std::string>(SharedBackendParams::kBackendId);
+  if (!backend_ || backend_name != backend_name_ ||
+      backend_->UpdateConfiguration(options_) == Backend::NEED_RESTART) {
+    backend_name_ = backend_name;
+    backend_ = CreateMemCache(BackendManager::Get()->CreateFromParams(options_),
+                              options_);
+    search_->SetBackend(backend_.get());
+  } else {
+    backend_->SetCacheSize(
+        options_.Get<int>(SharedBackendParams::kNNCacheSizeId));
   }
-  std::vector<ThinkingInfo> infos;
-  ThinkingInfo thinking;
-  thinking.depth = 1;
-  infos.push_back(thinking);
-  responder->OutputThinkingInfo(&infos);
-  BestMoveInfo info(best);
-  responder->OutputBestMove(&info);
 }
 
-}  // namespace
+void Engine::EnsureSyzygyTablebasesLoaded() {
+  const std::string tb_paths = options_.Get<std::string>(kSyzygyTablebaseId);
+  if (tb_paths == previous_tb_paths_) return;
+  previous_tb_paths_ = tb_paths;
 
-void EngineController::Go(const GoParams& params) {
-  // TODO: should consecutive calls to go be considered to be a continuation and
-  // hence have the same start time like this behaves, or should we check start
-  // time hasn't changed since last call to go and capture the new start time
-  // now?
-  if (strict_uci_timing_ || !move_start_time_) ResetMoveTimer();
-  go_params_ = params;
-
-  std::unique_ptr<UciResponder> responder =
-      std::make_unique<NonOwningUciRespondForwarder>(uci_responder_.get());
-
-  // Setting up current position, now that it's known whether it's ponder or
-  // not.
-  if (params.ponder && !current_position_.moves.empty()) {
-    std::vector<std::string> moves(current_position_.moves);
-    std::string ponder_move = moves.back();
-    moves.pop_back();
-    SetupPosition(current_position_.fen, moves);
-    responder = std::make_unique<PonderResponseTransformer>(
-        std::move(responder), ponder_move);
+  if (tb_paths.empty()) {
+    LOGFILE << "Reset Syzygy tablebases.";
+    syzygy_tb_.reset();
   } else {
-    SetupPosition(current_position_.fen, current_position_.moves);
-  }
-
-  if (!options_.Get<bool>(kUciChess960)) {
-    // Remap FRC castling to legacy castling.
-    responder = std::make_unique<Chess960Transformer>(
-        std::move(responder), tree_->HeadPosition().GetBoard());
+    syzygy_tb_ = std::make_unique<SyzygyTablebase>();
+    CERR << "Loading Syzygy tablebases from " << tb_paths;
+    if (!syzygy_tb_->init(tb_paths)) {
+      CERR << "Failed to load Syzygy tablebases!";
+      syzygy_tb_.reset();
+    }
   }
 
-  if (!options_.Get<bool>(kShowWDL)) {
-    // Strip WDL information from the response.
-    responder = std::make_unique<WDLResponseFilter>(std::move(responder));
-  }
+  search_->SetSyzygyTablebase(syzygy_tb_.get());
+}
 
-  if (!options_.Get<bool>(kShowMovesleft)) {
-    // Strip movesleft information from the response.
-    responder = std::make_unique<MovesLeftResponseFilter>(std::move(responder));
-  }
-  if (options_.Get<bool>(kValueOnly)) {
-    ValueOnlyGo(tree_.get(), network_.get(), options_, std::move(responder));
+// Initializes the search with either the specified position for the normal
+// search or the position one ply trimmed for the ponder search.
+void Engine::InitializeSearchPosition(bool for_ponder) {
+  LOGFILE << "Setting a new search position.";
+  assert(last_position_);
+  if (!for_ponder) {
+    search_->SetPosition(*last_position_);
     return;
   }
-
-  if (options_.Get<Button>(kClearTree).TestAndReset()) {
-    tree_->TrimTreeAtHead();
+  if (last_position_->moves.empty()) {
+    throw Exception("Ponder search requires at least one move.");
   }
-
-  auto stopper = time_manager_->GetStopper(params, *tree_.get());
-  search_ = std::make_unique<Search>(
-      *tree_, network_.get(), std::move(responder),
-      StringsToMovelist(params.searchmoves, tree_->HeadPosition().GetBoard()),
-      *move_start_time_, std::move(stopper), params.infinite, params.ponder,
-      options_, &cache_, syzygy_tb_.get());
-
-  LOGFILE << "Timer started at "
-          << FormatTime(SteadyClockToSystemClock(*move_start_time_));
-  search_->StartThreads(options_.Get<int>(kThreadsOptionId));
+  GameState position = *last_position_;
+  position.moves.pop_back();
+  search_->SetPosition(position);
+  return;
 }
 
-void EngineController::PonderHit() {
-  ResetMoveTimer();
-  go_params_.ponder = false;
-  Go(go_params_);
-}
-
-void EngineController::Stop() {
-  if (search_) search_->Stop();
-}
-
-EngineLoop::EngineLoop()
-    : engine_(
-          std::make_unique<CallbackUciResponder>(
-              std::bind(&UciLoop::SendBestMove, this, std::placeholders::_1),
-              std::bind(&UciLoop::SendInfo, this, std::placeholders::_1)),
-          options_.GetOptionsDict()) {
-  engine_.PopulateOptions(&options_);
-  options_.Add<StringOption>(kLogFileId);
+void Engine::SetPosition(const std::string& fen,
+                         const std::vector<std::string>& moves) {
+  EnsureSearchStopped();
+  ponder_enabled_ = options_.Get<bool>(kPonderId);
+  strict_uci_timing_ = options_.Get<bool>(kStrictUciTiming);
+  isready_seen_ = false;
+  search_->StartClock();
+  UpdateBackendConfig();
+  EnsureSyzygyTablebasesLoaded();
+  last_position_ = MakeGameState(fen, moves);
+  if (!ponder_enabled_) InitializeSearchPosition(/*for_ponder=*/false);
 }
 
-void EngineLoop::RunLoop() {
-  if (!ConfigFile::Init() || !options_.ProcessAllFlags()) return;
-  const auto options = options_.GetOptionsDict();
-  Logging::Get().SetFilename(options.Get<std::string>(kLogFileId));
-  if (options.Get<bool>(kPreload)) engine_.NewGame();
-  UciLoop::RunLoop();
+void Engine::NewGame() {
+  if (backend_) backend_->ClearCache();
+  search_->NewGame();
+  SetPosition(ChessBoard::kStartposFen, {});
 }
 
-void EngineLoop::CmdUci() {
-  SendId();
-  for (const auto& option : options_.ListOptionsUci()) {
-    SendResponse(option);
+void Engine::Go(const GoParams& params) {
+  if (!ponder_enabled_ && params.ponder) {
+    throw Exception(
+        "Ponder is not enabled, but the ponder search is requested.");
+  }
+  if ((strict_uci_timing_ && isready_seen_) ||
+      !(params.wtime || params.btime)) {
+    search_->StartClock();
   }
-  SendResponse("uciok");
+  if (!last_position_) NewGame();
+  if (ponder_enabled_) InitializeSearchPosition(params.ponder);
+  last_go_params_ = params;
+  search_->StartSearch(params);
 }
 
-void EngineLoop::CmdIsReady() {
-  engine_.EnsureReady();
-  SendResponse("readyok");
-}
+void Engine::EnsureReady() { isready_seen_ = true; }
 
-void EngineLoop::CmdSetOption(const std::string& name, const std::string& value,
-                              const std::string& context) {
-  options_.SetUciOption(name, value, context);
-  // Set the log filename for the case it was set in UCI option.
-  Logging::Get().SetFilename(
-      options_.GetOptionsDict().Get<std::string>(kLogFileId));
-}
+void Engine::Wait() { search_->WaitSearch(); }
 
-void EngineLoop::CmdUciNewGame() { engine_.NewGame(); }
+void Engine::Stop() { search_->StopSearch(); }
 
-void EngineLoop::CmdPosition(const std::string& position,
-                             const std::vector<std::string>& moves) {
-  std::string fen = position;
-  if (fen.empty()) {
-    fen = ChessBoard::kStartposFen;
+void Engine::PonderHit() {
+  if (!last_go_params_ || !last_go_params_->ponder) {
+    throw Exception("ponderhit while not pondering");
   }
-  engine_.SetPosition(fen, moves);
+  EnsureSearchStopped();
+  search_->StartClock();
+  last_go_params_->ponder = false;
+  InitializeSearchPosition(/*for_ponder=*/false);
+  search_->StartSearch(*last_go_params_);
 }
 
-void EngineLoop::CmdFen() {
-  std::string fen = GetFen(engine_.ApplyPositionMoves());
-  return SendResponse(fen);
+void Engine::RegisterUciResponder(UciResponder* responder) {
+  uci_forwarder_->Register(responder);
 }
-void EngineLoop::CmdGo(const GoParams& params) { engine_.Go(params); }
 
-void EngineLoop::CmdPonderHit() { engine_.PonderHit(); }
-
-void EngineLoop::CmdStop() { engine_.Stop(); }
+void Engine::UnregisterUciResponder(UciResponder* responder) {
+  uci_forwarder_->Unregister(responder);
+}
 
 }  // namespace lczero
diff --git a/src/engine.h b/src/engine.h
index 9743679a43..e50d661393 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -1,6 +1,6 @@
 /*
   This file is part of Leela Chess Zero.
-  Copyright (C) 2018 The LCZero Authors
+  Copyright (C) 2024 The LCZero Authors
 
   Leela Chess is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -27,115 +27,64 @@
 
 #pragma once
 
-#include <optional>
+#include <vector>
 
-#include "chess/uciloop.h"
-#include "mcts/search.h"
-#include "neural/cache.h"
-#include "neural/factory.h"
-#include "neural/network.h"
+#include "chess/gamestate.h"
+#include "engine_loop.h"
+#include "neural/memcache.h"
+#include "search/search.h"
 #include "syzygy/syzygy.h"
-#include "utils/mutex.h"
-#include "utils/optionsparser.h"
 
 namespace lczero {
 
-struct CurrentPosition {
-  std::string fen;
-  std::vector<std::string> moves;
-};
-
-class EngineController {
+class Engine : public EngineControllerBase {
  public:
-  EngineController(std::unique_ptr<UciResponder> uci_responder,
-                   const OptionsDict& options);
-
-  ~EngineController() {
-    // Make sure search is destructed first, and it still may be running in
-    // a separate thread.
-    search_.reset();
-  }
+  Engine(const SearchFactory&, const OptionsDict&);
+  ~Engine() override;
 
-  void PopulateOptions(OptionsParser* options);
+  static void PopulateOptions(OptionsParser*);
 
-  // Blocks.
-  void EnsureReady();
-
-  // Must not block.
-  void NewGame();
-
-  // Blocks.
+  void EnsureReady() override;
+  void NewGame() override;
   void SetPosition(const std::string& fen,
-                   const std::vector<std::string>& moves);
-
-  // Must not block.
-  void Go(const GoParams& params);
-  void PonderHit();
-  // Must not block.
-  void Stop();
+                   const std::vector<std::string>& moves) override;
+  void Go(const GoParams& params) override;
+  void PonderHit() override;
+  void Wait() override;
+  void Stop() override;
 
-  Position ApplyPositionMoves();
+  void RegisterUciResponder(UciResponder*) override;
+  void UnregisterUciResponder(UciResponder*) override;
 
  private:
-  void UpdateFromUciOptions();
-
-  void SetupPosition(const std::string& fen,
-                     const std::vector<std::string>& moves);
-  void ResetMoveTimer();
-  void CreateFreshTimeManager();
+  void UpdateBackendConfig();
+  void EnsureSearchStopped();
+  void EnsureSyzygyTablebasesLoaded();
+  void InitializeSearchPosition(bool for_ponder);
 
+  class UciPonderForwarder;
+  std::unique_ptr<UciPonderForwarder> uci_forwarder_;
   const OptionsDict& options_;
-
-  std::unique_ptr<UciResponder> uci_responder_;
-
-  // Locked means that there is some work to wait before responding readyok.
-  RpSharedMutex busy_mutex_;
-  using SharedLock = std::shared_lock<RpSharedMutex>;
-
-  std::unique_ptr<TimeManager> time_manager_;
-  std::unique_ptr<Search> search_;
-  std::unique_ptr<NodeTree> tree_;
-  std::unique_ptr<SyzygyTablebase> syzygy_tb_;
-  std::unique_ptr<Network> network_;
-  NNCache cache_;
-
-  // Store current TB and network settings to track when they change so that
-  // they are reloaded.
-  std::string tb_paths_;
-  NetworkFactory::BackendConfiguration network_configuration_;
-
-  // The current position as given with SetPosition. For normal (ie. non-ponder)
-  // search, the tree is set up with this position, however, during ponder we
-  // actually search the position one move earlier.
-  CurrentPosition current_position_;
-  GoParams go_params_;
-
-  std::optional<std::chrono::steady_clock::time_point> move_start_time_;
-
-  // If true we can reset move_start_time_ in "Go".
-  bool strict_uci_timing_;
-};
-
-class EngineLoop : public UciLoop {
- public:
-  EngineLoop();
-
-  void RunLoop() override;
-  void CmdUci() override;
-  void CmdIsReady() override;
-  void CmdSetOption(const std::string& name, const std::string& value,
-                    const std::string& context) override;
-  void CmdUciNewGame() override;
-  void CmdPosition(const std::string& position,
-                   const std::vector<std::string>& moves) override;
-  void CmdFen() override;
-  void CmdGo(const GoParams& params) override;
-  void CmdPonderHit() override;
-  void CmdStop() override;
-
- private:
-  OptionsParser options_;
-  EngineController engine_;
+  std::unique_ptr<SearchBase> search_;  // absl_notnull
+  std::string backend_name_;  // Remember the backend name to track changes.
+  std::unique_ptr<CachingBackend> backend_;  // absl_nullable
+
+  // Remember previous tablebase paths to detect when to reload them.
+  std::string previous_tb_paths_;
+  std::unique_ptr<SyzygyTablebase> syzygy_tb_;  // absl_nullable
+
+  // UCI parameters cache to be consistent between `position` and `go`.
+  // Defaults ensure correct operation even if `go` comes first.
+  bool ponder_enabled_ = false;
+  bool strict_uci_timing_ = true;
+  bool isready_seen_ = true;
+  // Last position set for the search. Used to:
+  // 1. Detect whether the position was ever set (to initialize to startpos).
+  // 2. Remember the position for ponder go (removing the last ply).
+  // 3. Remember the position for ponderhit.
+  std::optional<GameState> last_position_ = std::nullopt;
+  // Go parameters for the last search. Used on ponder.
+  std::optional<GoParams> last_go_params_ = std::nullopt;
 };
 
-}  // namespace lczero
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/engine_loop.cc b/src/engine_loop.cc
new file mode 100644
index 0000000000..f919cd2a43
--- /dev/null
+++ b/src/engine_loop.cc
@@ -0,0 +1,84 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "engine_loop.h"
+
+#include <iostream>
+
+#include "engine.h"
+#include "neural/shared_params.h"
+#include "utils/configfile.h"
+
+namespace lczero {
+namespace {
+const OptionId kLogFileId{
+    {.long_flag = "logfile",
+     .uci_option = "LogFile",
+     .help_text = "Write log to that file. Special value <stderr> to "
+                  "output the log to the console.",
+     .short_flag = 'l',
+     .visibility = OptionId::kAlwaysVisible}};
+}  // namespace
+
+void RunEngine(SearchFactory* factory) {
+  // Reject a missing factory up front; every step below dereferences it.
+  if (!factory) throw Exception("Search algorithm is not available");
+  CERR << "Search algorithm: " << factory->GetName();
+  StdoutUciResponder uci_responder;
+
+  // Populate options from various sources.
+  OptionsParser options_parser;
+  options_parser.Add<StringOption>(kLogFileId);
+  ConfigFile::PopulateOptions(&options_parser);
+  Engine::PopulateOptions(&options_parser);
+  factory->PopulateParams(&options_parser);       // Search params.
+  uci_responder.PopulateParams(&options_parser);  // UCI params.
+  SharedBackendParams::Populate(&options_parser);
+
+  // Parse flags, show help, initialize logging, read config etc.
+  if (!ConfigFile::Init() || !options_parser.ProcessAllFlags()) return;
+  const auto& options = options_parser.GetOptionsDict();  // Not a copy.
+  Logging::Get().SetFilename(options.Get<std::string>(kLogFileId));
+
+  // Create the engine, then feed it stdin lines until EOF or ProcessLine()
+  // returns false.
+  Engine engine(*factory, options);
+  UciLoop loop(&uci_responder, &options_parser, &engine);
+  std::cout.setf(std::ios::unitbuf);
+  std::string line;
+  while (std::getline(std::cin, line)) {
+    LOGFILE << ">> " << line;
+    try {
+      if (!loop.ProcessLine(line)) break;
+      // Set the log filename for the case it was set in UCI option.
+      Logging::Get().SetFilename(options.Get<std::string>(kLogFileId));
+    } catch (Exception& ex) {
+      uci_responder.SendRawResponse(std::string("error ") + ex.what());
+    }
+  }
+}
+
+}  // namespace lczero
diff --git a/src/rescorer/rescoreloop.h b/src/engine_loop.h
similarity index 84%
rename from src/rescorer/rescoreloop.h
rename to src/engine_loop.h
index fac41c3bab..9baed9ad80 100644
--- a/src/rescorer/rescoreloop.h
+++ b/src/engine_loop.h
@@ -1,6 +1,6 @@
 /*
   This file is part of Leela Chess Zero.
-  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2018-2025 The LCZero Authors
 
   Leela Chess is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -27,23 +27,16 @@
 
 #pragma once
 
-#include <thread>
+#include <memory>
+#include <string>
 
 #include "chess/uciloop.h"
+#include "search/search.h"
 #include "utils/optionsparser.h"
 
 namespace lczero {
 
-class RescoreLoop : public UciLoop {
- public:
-  RescoreLoop();
-  ~RescoreLoop();
+// Runs the stdin/stdout UCI loop for the engine.
+void RunEngine(SearchFactory* factory);
 
-  void RunLoop() override;
-
- private:
-  OptionsParser options_;
-
-};
-
-}  // namespace lczero
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/engine_test.cc b/src/engine_test.cc
new file mode 100644
index 0000000000..6b79aa5fbd
--- /dev/null
+++ b/src/engine_test.cc
@@ -0,0 +1,141 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "engine.h"
+
+#include <future>
+
+#include "gtest/gtest.h"
+#include "neural/mock_backend.h"
+#include "neural/register.h"
+#include "neural/shared_params.h"
+#include "search/mock_search.h"
+#include "search/search.h"
+
+namespace lczero {
+namespace {
+
+using testing::_;
+using testing::Invoke;
+using testing::Return;
+
+class EngineTest : public ::testing::Test {
+ protected:
+  EngineTest() {
+    Engine::PopulateOptions(&options_parser_);
+    SharedBackendParams::Populate(&options_parser_);
+    options_ = options_parser_.GetMutableOptions();
+    auto backend_factory = std::make_unique<MockBackendFactory>();
+    backend_factory_ = backend_factory.get();
+    ON_CALL(*backend_factory_, GetName()).WillByDefault(Return("mock"));
+    ON_CALL(*backend_factory_, Create(_))
+        .WillByDefault([this](const OptionsDict&) {
+          auto backend = std::make_unique<MockBackend>();
+          backend_ = backend.get();
+          return backend;
+        });
+
+    BackendManager::Get()->AddBackend(std::move(backend_factory));
+
+    options_->Set<std::string>(SharedBackendParams::kBackendId, "mock");
+    options_->Set<int>(SharedBackendParams::kNNCacheSizeId, 10);
+
+    EXPECT_CALL(search_factory_, CreateSearch(_, _))
+        .WillOnce([&](UciResponder* responder, const OptionsDict*) {
+          auto search = std::make_unique<MockSearch>(responder);
+          search_ = search.get();
+          return search;
+        });
+  }
+
+  ~EngineTest() { BackendManager::Get()->RemoveBackend(backend_factory_); }
+
+  OptionsParser options_parser_;
+  OptionsDict* options_ = nullptr;  // absl_notnull
+  MockBackend* backend_ = nullptr;
+  MockBackendFactory* backend_factory_ = nullptr;
+  MockSearchFactory search_factory_;
+  MockSearch* search_ = nullptr;
+  std::unique_ptr<Engine> engine_;
+};
+
+class WaitingUciResponder : public UciResponder {
+ public:
+  // Signals the pending promise so that a blocked Wait() can return.
+  void OutputBestMove(BestMoveInfo*) override {
+    bestmove_promise_->set_value();
+  }
+  // Thinking info is ignored.
+  void OutputThinkingInfo(std::vector<ThinkingInfo>*) override {}
+
+  // Blocks until a bestmove was output, then re-arms for the next wait.
+  void Wait() {
+    bestmove_promise_->get_future().wait();
+    bestmove_promise_ = std::make_unique<std::promise<void>>();
+  }
+
+ private:
+  std::unique_ptr<std::promise<void>> bestmove_promise_ =
+      std::make_unique<std::promise<void>>();
+};
+
+TEST_F(EngineTest, BackendReloadByUpdateBackendConfig) {
+  WaitingUciResponder uci_responder;
+  Engine engine(search_factory_, *options_);
+  engine.RegisterUciResponder(&uci_responder);
+  EXPECT_EQ(backend_, nullptr);  // Backend not created before the search.
+  EXPECT_CALL(*search_, StartSearch(_)).WillRepeatedly([&](const GoParams&) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(30));
+    static BestMoveInfo bestmove_info(Move::White(kSquareE1, kSquareA1));
+    search_->GetUciResponder()->OutputBestMove(&bestmove_info);
+  });
+  engine.Go(GoParams{.nodes = 10});
+  uci_responder.Wait();
+  EXPECT_NE(backend_, nullptr);  // Backend created after the search.
+  Backend* prev_backend = backend_;
+  EXPECT_CALL(*backend_, UpdateConfiguration(_))
+      .WillOnce(Return(Backend::UPDATE_OK));
+  engine.NewGame();
+  engine.Go(GoParams{.nodes = 10});
+  uci_responder.Wait();
+  EXPECT_EQ(backend_, prev_backend);  // Backend not recreated.
+  EXPECT_CALL(*backend_, UpdateConfiguration(_))
+      .WillOnce(Return(Backend::NEED_RESTART));
+  engine.Go(GoParams{.nodes = 10});  // Go alone should not restart the backend.
+  uci_responder.Wait();
+  EXPECT_EQ(backend_, prev_backend);
+  engine.NewGame();
+  EXPECT_NE(backend_, prev_backend);  // Backend recreated.
+}
+
+}  // namespace
+}  // namespace lczero
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/main.cc b/src/main.cc
index 4c8880d4e6..dc83a199e8 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -25,20 +25,59 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "benchmark/backendbench.h"
-#include "benchmark/benchmark.h"
 #include "chess/board.h"
+#include "default_search.h"
 #include "engine.h"
-#include "lc0ctl/describenet.h"
-#include "lc0ctl/leela2onnx.h"
-#include "lc0ctl/onnx2leela.h"
+#include "search/register.h"
 #include "selfplay/loop.h"
+#include "tools/backendbench.h"
+#include "tools/benchmark.h"
+#include "tools/describenet.h"
+#include "tools/leela2onnx.h"
+#include "tools/onnx2leela.h"
 #include "utils/commandline.h"
 #include "utils/esc_codes.h"
 #include "utils/logging.h"
+#include "utils/trace.h"
 #include "version.h"
 
+namespace lczero {
+void ChooseAndRunEngine() {
+  auto* const manager = SearchManager::Get();
+  // First try the engine which is explicitly specified on the command line.
+  for (const std::string_view search_name : manager->GetSearchNames()) {
+    if (CommandLine::ConsumeCommand(search_name)) {
+      RunEngine(manager->GetFactoryByName(search_name));
+      return;
+    }
+  }
+
+  // Then if DEFAULT_SEARCH is defined, run the engine specified by it.
+#ifdef DEFAULT_SEARCH
+  SearchFactory* factory = manager->GetFactoryByName(DEFAULT_SEARCH);
+  if (!factory) throw Exception("Unknown search algorithm: " DEFAULT_SEARCH);
+  RunEngine(factory);
+  return;
+#endif
+
+  // Then try to run the engine which is specified by the name of the binary.
+  const std::string& binary_name = CommandLine::BinaryName();
+  for (const std::string_view search_name : manager->GetSearchNames()) {
+    if (binary_name.find(search_name) != std::string::npos) {
+      RunEngine(manager->GetFactoryByName(search_name));
+      return;
+    }
+  }
+
+  // Finally, run "classic" search through the new API (if it is registered).
+  SearchFactory* classic = manager->GetFactoryByName("classic");
+  if (!classic) throw Exception("Unknown search algorithm: classic");
+  RunEngine(classic);
+}
+}  // namespace lczero
+
 int main(int argc, const char** argv) {
+  LCTRACE_INITIALIZE;
   using namespace lczero;
   EscCodes::Init();
   LOGFILE << "Lc0 started.";
@@ -51,25 +90,38 @@ int main(int argc, const char** argv) {
     InitializeMagicBitboards();
 
     CommandLine::Init(argc, argv);
-    CommandLine::RegisterMode("uci", "(default) Act as UCI engine");
-    CommandLine::RegisterMode("selfplay", "Play games with itself");
-    CommandLine::RegisterMode("benchmark", "Quick benchmark");
-    CommandLine::RegisterMode("backendbench",
-                              "Quick benchmark of backend only");
-    CommandLine::RegisterMode("leela2onnx", "Convert Leela network to ONNX.");
-    CommandLine::RegisterMode("onnx2leela",
-                              "Convert ONNX network to Leela net.");
-    CommandLine::RegisterMode("describenet",
-                              "Shows details about the Leela network.");
+    if (CommandLine::BinaryName().find("simple") == std::string::npos) {
+      CommandLine::RegisterMode("selfplay", "Play games with itself");
+      CommandLine::RegisterMode("benchmark", "Quick benchmark");
+      CommandLine::RegisterMode("bench", "Very quick benchmark");
+      CommandLine::RegisterMode("backendbench",
+                                "Quick benchmark of backend only");
+      CommandLine::RegisterMode("leela2onnx", "Convert Leela network to ONNX.");
+      CommandLine::RegisterMode("onnx2leela",
+                                "Convert ONNX network to Leela net.");
+      CommandLine::RegisterMode("describenet",
+                                "Shows details about the Leela network.");
+    }
+    for (const std::string_view search_name :
+         SearchManager::Get()->GetSearchNames()) {
+      CommandLine::RegisterMode(
+          std::string(search_name),
+          "Use \"" + std::string(search_name) + "\" search");
+    }
 
     if (CommandLine::ConsumeCommand("selfplay")) {
       // Selfplay mode.
-      SelfPlayLoop loop;
-      loop.RunLoop();
+      StdoutUciResponder uci_responder;
+      SelfPlayLoop loop(&uci_responder);
+      loop.Run();
     } else if (CommandLine::ConsumeCommand("benchmark")) {
-      // Benchmark mode.
+      // Benchmark mode, longer version.
       Benchmark benchmark;
       benchmark.Run();
+    } else if (CommandLine::ConsumeCommand("bench")) {
+      // Benchmark mode, shorter version.
+      Benchmark benchmark;
+      benchmark.Run(/*run_shorter_benchmark=*/true);
     } else if (CommandLine::ConsumeCommand("backendbench")) {
       // Backend Benchmark mode.
       BackendBenchmark benchmark;
@@ -81,11 +133,7 @@ int main(int argc, const char** argv) {
     } else if (CommandLine::ConsumeCommand("describenet")) {
       lczero::DescribeNetworkCmd();
     } else {
-      // Consuming optional "uci" mode.
-      CommandLine::ConsumeCommand("uci");
-      // Ordinary UCI engine.
-      EngineLoop loop;
-      loop.RunLoop();
+      lczero::ChooseAndRunEngine();
     }
   } catch (std::exception& e) {
     std::cerr << "Unhandled exception: " << e.what() << std::endl;
diff --git a/src/neural/backend.cc b/src/neural/backend.cc
new file mode 100644
index 0000000000..fee27f2ade
--- /dev/null
+++ b/src/neural/backend.cc
@@ -0,0 +1,68 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/backend.h"
+
+#include <string>
+
+#include "neural/shared_params.h"
+#include "utils/hashcat.h"
+
+namespace lczero {
+
+std::vector<EvalResult> Backend::EvaluateBatch(
+    std::span<const EvalPosition> positions) {
+  // Reserve up front: the computation is handed pointers into the elements,
+  // so the vector must not reallocate while inputs are being added.
+  std::vector<EvalResult> batch;
+  batch.reserve(positions.size());
+  const auto computation = CreateComputation();
+  for (const EvalPosition& position : positions) {
+    EvalResult& slot = batch.emplace_back();
+    slot.p.resize(position.legal_moves.size());  // One entry per legal move.
+    computation->AddInput(position, slot.AsPtr());
+  }
+  // Evaluate everything that was enqueued; the slots are filled in place.
+  computation->ComputeBlocking();
+  return batch;
+}
+
+uint64_t Backend::ConfigurationHash(const OptionsDict& options) const {
+  // Chains the hashes of the options below, in this fixed order.
+  const auto str_hash = [&options](const auto& id) {
+    return std::hash<std::string>{}(options.Get<std::string>(id));
+  };
+  uint64_t digest = str_hash(SharedBackendParams::kBackendId);
+  digest = HashCat(digest, str_hash(SharedBackendParams::kBackendOptionsId));
+  digest = HashCat(digest, str_hash(SharedBackendParams::kWeightsId));
+  digest = HashCat(digest, std::hash<float>{}(options.Get<float>(
+                               SharedBackendParams::kPolicySoftmaxTemp)));
+  digest = HashCat(digest, str_hash(SharedBackendParams::kHistoryFill));
+  return digest;
+}
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/neural/backend.h b/src/neural/backend.h
new file mode 100644
index 0000000000..a04b8f62d1
--- /dev/null
+++ b/src/neural/backend.h
@@ -0,0 +1,136 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <span>
+#include <vector>
+
+#include "chess/position.h"
+#include "neural/loader.h"
+#include "utils/optionsdict.h"
+
+namespace lczero {
+
+// Information about the backend or network that search may need.
+struct BackendAttributes {
+  bool has_mlh;
+  bool has_wdl;
+  bool runs_on_cpu;
+  int suggested_num_search_threads;
+  int recommended_batch_size;
+  int maximum_batch_size;
+};
+
+struct EvalResultPtr {
+  float* q = nullptr;
+  float* d = nullptr;
+  float* m = nullptr;
+  std::span<float> p = {};
+};
+
+struct EvalResult {
+  float q;
+  float d;
+  float m;
+  std::vector<float> p;
+
+  EvalResultPtr AsPtr() {
+    return EvalResultPtr{.q = &q, .d = &d, .m = &m, .p = p};
+  }
+};
+
+struct EvalPosition {
+  std::span<const Position> pos;
+  std::span<const Move> legal_moves;
+};
+
+class BackendComputation {
+ public:
+  virtual ~BackendComputation() = default;
+  virtual size_t UsedBatchSize() const = 0;
+  enum AddInputResult {
+    ENQUEUED_FOR_EVAL = 0,    // Will be computed during ComputeBlocking();
+    FETCHED_IMMEDIATELY = 1,  // Was in cache, the result is already populated.
+  };
+  virtual AddInputResult AddInput(
+      const EvalPosition& pos,    // Input position.
+      EvalResultPtr result) = 0;  // Where to fetch data into.
+  virtual void ComputeBlocking() = 0;
+};
+
+class Backend {
+ public:
+  virtual ~Backend() = default;
+  virtual BackendAttributes GetAttributes() const = 0;
+  virtual std::unique_ptr<BackendComputation> CreateComputation() = 0;
+
+  // Simple helper with default implementation, to evaluate a batch without
+  // creating a computation explicitly.
+  virtual std::vector<EvalResult> EvaluateBatch(
+      std::span<const EvalPosition> positions);
+  // Returns the evaluation if it's possible to do immediately.
+  virtual std::optional<EvalResult> GetCachedEvaluation(const EvalPosition&) {
+    return std::nullopt;
+  }
+
+  // Outcome of UpdateConfiguration(), which is called between searches.
+  enum UpdateConfigurationResult {
+    UPDATE_OK = 0,     // Backend handled the update by itself (if needed).
+    NEED_RESTART = 1,  // Recreate the backend.
+  };
+  // Stores the hash of `opts`; backends may override to react to changes.
+  virtual UpdateConfigurationResult UpdateConfiguration(
+      const OptionsDict& opts) {
+    current_config_hash_ = ConfigurationHash(opts);
+    return UPDATE_OK;
+  }
+  // True if `opts` hashes to the value stored by the last update.
+  virtual bool IsSameConfiguration(const OptionsDict& opts) const {
+    return ConfigurationHash(opts) == current_config_hash_;
+  }
+
+ private:
+  // Gets a hash of the backend configuration, to help detect changes.
+  virtual uint64_t ConfigurationHash(const OptionsDict&) const;
+
+  std::optional<uint64_t> current_config_hash_;  // Empty until first update.
+};
+
+class BackendFactory {
+ public:
+  virtual ~BackendFactory() = default;
+  // Higher priority is higher.
+  virtual int GetPriority() const = 0;
+  virtual std::string_view GetName() const = 0;
+  virtual std::unique_ptr<Backend> Create(const OptionsDict&) = 0;
+};
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/neural/blas/README.md b/src/neural/backends/blas/README.md
similarity index 100%
rename from src/neural/blas/README.md
rename to src/neural/backends/blas/README.md
diff --git a/src/neural/blas/blas.h b/src/neural/backends/blas/blas.h
similarity index 92%
rename from src/neural/blas/blas.h
rename to src/neural/backends/blas/blas.h
index 7001be64d7..a9018c71d2 100644
--- a/src/neural/blas/blas.h
+++ b/src/neural/backends/blas/blas.h
@@ -18,6 +18,13 @@
 
 #pragma once
 
+// clang-format off
+// math.h include is workaround for Eigen trying to use math functions from global
+// namespaces. math.h must be included before Eigen/Core.
+#include <math.h>
+#include <Eigen/Core>
+// clang-format on
+
 // Select the BLAS vendor based on defines
 
 #ifdef USE_MKL
diff --git a/src/neural/blas/convolution1.cc b/src/neural/backends/blas/convolution1.cc
similarity index 97%
rename from src/neural/blas/convolution1.cc
rename to src/neural/backends/blas/convolution1.cc
index c78b3976fb..1da550cb5b 100644
--- a/src/neural/blas/convolution1.cc
+++ b/src/neural/backends/blas/convolution1.cc
@@ -16,10 +16,8 @@
  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "neural/blas/convolution1.h"
-#include "neural/blas/blas.h"
-
-#include <Eigen/Dense>
+#include "neural/backends/blas/convolution1.h"
+#include "neural/backends/blas/blas.h"
 
 namespace lczero {
 template <typename T>
diff --git a/src/neural/blas/convolution1.h b/src/neural/backends/blas/convolution1.h
similarity index 100%
rename from src/neural/blas/convolution1.h
rename to src/neural/backends/blas/convolution1.h
diff --git a/src/neural/blas/encoder.h b/src/neural/backends/blas/encoder.h
similarity index 98%
rename from src/neural/blas/encoder.h
rename to src/neural/backends/blas/encoder.h
index 28cf3b16b1..b4544b16ff 100644
--- a/src/neural/blas/encoder.h
+++ b/src/neural/backends/blas/encoder.h
@@ -20,8 +20,6 @@
 
 #include <cmath>
 
-#include "neural/shared/activation.h"
-
 #ifdef USE_ISPC
 #include "layer_norm_ispc.h"
 #endif
diff --git a/src/neural/blas/fully_connected_layer.cc b/src/neural/backends/blas/fully_connected_layer.cc
similarity index 97%
rename from src/neural/blas/fully_connected_layer.cc
rename to src/neural/backends/blas/fully_connected_layer.cc
index d228d2523f..d0736c1eb3 100644
--- a/src/neural/blas/fully_connected_layer.cc
+++ b/src/neural/backends/blas/fully_connected_layer.cc
@@ -16,16 +16,13 @@
  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "neural/blas/fully_connected_layer.h"
-#include "neural/blas/blas.h"
-#include "neural/shared/activation.h"
+#include "neural/backends/blas/fully_connected_layer.h"
+#include "neural/backends/blas/blas.h"
 
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 
-#include <Eigen/Dense>
-
 namespace lczero {
 namespace {
 void ApplyBias(size_t batch_size, const size_t output_size, const float* biases,
diff --git a/src/neural/blas/fully_connected_layer.h b/src/neural/backends/blas/fully_connected_layer.h
similarity index 96%
rename from src/neural/blas/fully_connected_layer.h
rename to src/neural/backends/blas/fully_connected_layer.h
index 1917c41378..821947cef4 100644
--- a/src/neural/blas/fully_connected_layer.h
+++ b/src/neural/backends/blas/fully_connected_layer.h
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include "neural/shared/activation.h"
+#include "neural/backends/shared/activation.h"
 
 #include <cstddef>
 #include <vector>
diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/backends/blas/layer_norm.ispc
similarity index 100%
rename from src/neural/blas/layer_norm.ispc
rename to src/neural/backends/blas/layer_norm.ispc
diff --git a/src/neural/blas/network_blas.cc b/src/neural/backends/blas/network_blas.cc
similarity index 97%
rename from src/neural/blas/network_blas.cc
rename to src/neural/backends/blas/network_blas.cc
index 607700df7d..c91c5c44f5 100644
--- a/src/neural/blas/network_blas.cc
+++ b/src/neural/backends/blas/network_blas.cc
@@ -16,25 +16,24 @@
  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <Eigen/Core>
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <iostream>
 
-#include "neural/blas/blas.h"
-#include "neural/blas/convolution1.h"
-#include "neural/blas/encoder.h"
-#include "neural/blas/fully_connected_layer.h"
-#include "neural/blas/se_unit.h"
-#include "neural/blas/winograd_convolution3.h"
+#include "neural/backends/blas/blas.h"
+#include "neural/backends/blas/convolution1.h"
+#include "neural/backends/blas/encoder.h"
+#include "neural/backends/blas/fully_connected_layer.h"
+#include "neural/backends/blas/se_unit.h"
+#include "neural/backends/blas/winograd_convolution3.h"
+#include "neural/backends/shared/activation.h"
+#include "neural/backends/shared/winograd_filter.h"
 #include "neural/factory.h"
 #include "neural/network.h"
 #include "neural/network_legacy.h"
-#include "neural/shared/activation.h"
-#include "neural/shared/attention_policy_map.h"
-#include "neural/shared/policy_map.h"
-#include "neural/shared/winograd_filter.h"
+#include "neural/tables/attention_policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "utils/numa.h"
 
 #ifdef USE_DNNL
@@ -70,7 +69,7 @@ class BlasComputation : public NetworkComputation {
                   const ActivationFunction smolgen_activation,
                   const ActivationFunction ffn_activation,
                   const bool attn_policy, const bool attn_body,
-                  bool is_pe_dense_embedding);
+                  bool is_pe_dense_embedding, int threads);
 
   virtual ~BlasComputation() {}
 
@@ -157,13 +156,14 @@ template <bool use_eigen>
 class BlasNetwork : public Network {
  public:
   BlasNetwork(const WeightsFile& weights, const OptionsDict& options);
-  virtual ~BlasNetwork(){};
+  virtual ~BlasNetwork() {};
 
   std::unique_ptr<NetworkComputation> NewComputation() override {
     return std::make_unique<BlasComputation<use_eigen>>(
         this, weights_, policy_head_, value_head_, max_batch_size_, wdl_,
         moves_left_, conv_policy_, default_activation_, smolgen_activation_,
-        ffn_activation_, attn_policy_, attn_body_, is_pe_dense_embedding_);
+        ffn_activation_, attn_policy_, attn_body_, is_pe_dense_embedding_,
+        threads_);
   }
 
   const NetworkCapabilities& GetCapabilities() const override {
@@ -199,15 +199,16 @@ class BlasNetwork : public Network {
   const NetworkCapabilities capabilities_;
   MultiHeadWeights weights_;
   size_t max_batch_size_;
+  int threads_;
   bool wdl_;
   bool moves_left_;
   bool conv_policy_;
   bool attn_policy_;
   bool attn_body_;
   bool is_pe_dense_embedding_;
-  ActivationFunction default_activation_;
-  ActivationFunction smolgen_activation_;
-  ActivationFunction ffn_activation_;
+  ActivationFunction default_activation_ = ACTIVATION_NONE;
+  ActivationFunction smolgen_activation_ = ACTIVATION_NONE;
+  ActivationFunction ffn_activation_ = ACTIVATION_NONE;
   std::string policy_head_;
   std::string value_head_;
   std::mutex buffers_lock_;
@@ -222,7 +223,8 @@ BlasComputation<use_eigen>::BlasComputation(
     const bool conv_policy, const ActivationFunction default_activation,
     const ActivationFunction smolgen_activation,
     const ActivationFunction ffn_activation, const bool attn_policy,
-    const bool attn_body, bool is_pe_dense_embedding)
+    const bool attn_body, bool is_pe_dense_embedding,
+    [[maybe_unused]] int threads)
     : weights_(weights),
       max_batch_size_(max_batch_size),
       policies_(0),
@@ -240,7 +242,7 @@ BlasComputation<use_eigen>::BlasComputation(
       value_head_(value_head),
       network_(network) {
 #ifdef USE_DNNL
-  omp_set_num_threads(1);
+  omp_set_num_threads(threads);
 #endif
 }
 
@@ -989,6 +991,7 @@ BlasNetwork<use_eigen>::BlasNetwork(const WeightsFile& file,
 
   max_batch_size_ =
       static_cast<size_t>(options.GetOrDefault<int>("batch_size", 256));
+  threads_ = options.GetOrDefault<int>("threads", 1);
 
   auto nf = file.format().network_format();
   using NF = pblczero::NetworkFormat;
@@ -1075,7 +1078,7 @@ BlasNetwork<use_eigen>::BlasNetwork(const WeightsFile& file,
   } else {
 #ifdef USE_OPENBLAS
     int num_procs = openblas_get_num_procs();
-    openblas_set_num_threads(1);
+    openblas_set_num_threads(threads_);
     const char* core_name = openblas_get_corename();
     const char* config = openblas_get_config();
     CERR << "BLAS vendor: OpenBLAS.";
@@ -1084,7 +1087,7 @@ BlasNetwork<use_eigen>::BlasNetwork(const WeightsFile& file,
 #endif
 
 #ifdef USE_MKL
-    mkl_set_num_threads(1);
+    mkl_set_num_threads(threads_);
     CERR << "BLAS vendor: MKL.";
     constexpr int len = 256;
     char versionbuf[len];
diff --git a/src/neural/blas/se_unit.cc b/src/neural/backends/blas/se_unit.cc
similarity index 97%
rename from src/neural/blas/se_unit.cc
rename to src/neural/backends/blas/se_unit.cc
index a4f24f0e0b..68761f75f5 100644
--- a/src/neural/blas/se_unit.cc
+++ b/src/neural/backends/blas/se_unit.cc
@@ -16,8 +16,8 @@
  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "neural/blas/se_unit.h"
-#include "neural/blas/fully_connected_layer.h"
+#include "neural/backends/blas/se_unit.h"
+#include "neural/backends/blas/fully_connected_layer.h"
 
 #include <cmath>
 
diff --git a/src/neural/blas/se_unit.h b/src/neural/backends/blas/se_unit.h
similarity index 96%
rename from src/neural/blas/se_unit.h
rename to src/neural/backends/blas/se_unit.h
index afc7f2dc15..a93bce1750 100644
--- a/src/neural/blas/se_unit.h
+++ b/src/neural/backends/blas/se_unit.h
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include "neural/shared/activation.h"
+#include "neural/backends/shared/activation.h"
 
 #include <cstddef>
 
diff --git a/src/neural/blas/winograd_convolution3.cc b/src/neural/backends/blas/winograd_convolution3.cc
similarity index 99%
rename from src/neural/blas/winograd_convolution3.cc
rename to src/neural/backends/blas/winograd_convolution3.cc
index cd77eb1a21..c1687aebe6 100644
--- a/src/neural/blas/winograd_convolution3.cc
+++ b/src/neural/backends/blas/winograd_convolution3.cc
@@ -16,8 +16,8 @@
  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "neural/blas/winograd_convolution3.h"
-#include "neural/blas/blas.h"
+#include "neural/backends/blas/winograd_convolution3.h"
+#include "neural/backends/blas/blas.h"
 
 #include <algorithm>
 #include <cassert>
@@ -29,8 +29,6 @@
 #include "winograd_transform_ispc.h"
 #endif
 
-#include <Eigen/Dense>
-
 namespace lczero {
 template <typename T>
 using EigenMatrixMap =
diff --git a/src/neural/blas/winograd_convolution3.h b/src/neural/backends/blas/winograd_convolution3.h
similarity index 100%
rename from src/neural/blas/winograd_convolution3.h
rename to src/neural/backends/blas/winograd_convolution3.h
diff --git a/src/neural/blas/winograd_transform.ispc b/src/neural/backends/blas/winograd_transform.ispc
similarity index 100%
rename from src/neural/blas/winograd_transform.ispc
rename to src/neural/backends/blas/winograd_transform.ispc
diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/backends/cuda/common_kernels.cu
similarity index 87%
rename from src/neural/cuda/common_kernels.cu
rename to src/neural/backends/cuda/common_kernels.cu
index 395bab8d84..bab99ce4cf 100644
--- a/src/neural/cuda/common_kernels.cu
+++ b/src/neural/backends/cuda/common_kernels.cu
@@ -29,8 +29,9 @@
 #include <cassert>
 
 #include "cuda_common.h"
-#include "neural/shared/activation.h"
-#include "neural/shared/attention_policy_map.h"
+#include "neural/tables/activation_function.h"
+#include "neural/tables/attention_policy_map.h"
+#include "utils/exception.h"
 #include "winograd_helper.inc"
 
 namespace lczero {
@@ -381,12 +382,13 @@ __global__ void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor,
 
 template <typename DstType, typename SrcType>
 void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor,
-                       int Nin, int Cin, int Nout, int Cout, int H, int W) {
+                       int Nin, int Cin, int Nout, int Cout, int H, int W,
+                       cudaStream_t stream) {
   size_t numElements = Nout * Cout * H * W;
   const int blockSize = 256;
   int blocks = DivUp(numElements, blockSize);
-  NCHWtoNHWC_kernel<<<blocks, blockSize>>>(output_tensor, input_tensor, Nin,
-                                           Cin, Nout, Cout, H, W);
+  NCHWtoNHWC_kernel<<<blocks, blockSize, 0, stream>>>(
+      output_tensor, input_tensor, Nin, Cin, Nout, Cout, H, W);
 }
 
 template <typename DstType, typename SrcType>
@@ -437,65 +439,20 @@ __global__ void batchNorm_kernel(T* output, const T* input, const T* skipInput,
 template <typename T>
 void batchNorm(T* output, const T* input, const T* skipInput, int N, int C,
                int H, int W, float* means, float* var_multipliers,
-               ActivationFunction activation) {
+               ActivationFunction activation, cudaStream_t stream) {
   const int total_elements = N * C * H * W;
   const int kBlockSize = 256;
   int blocks = DivUp(total_elements, kBlockSize);
 
-  batchNorm_kernel<<<blocks, kBlockSize>>>(output, input, skipInput, N, C, H, W,
-                                           means, var_multipliers, activation);
+  batchNorm_kernel<<<blocks, kBlockSize, 0, stream>>>(
+      output, input, skipInput, N, C, H, W, means, var_multipliers, activation);
 
   ReportCUDAErrors(cudaGetLastError());
 }
 
-__global__ void expandPlanes_kernel_Fp32_NCHW(float* output,
-                                              const uint64_t* masks,
-                                              const float* values, int n) {
-  // Block size of 256, same mask/val for 64 consecutive threads.
-  constexpr int kNumShmemElements = 256 / 64;
-
-  __shared__ uint64_t shMasks[kNumShmemElements];
-  __shared__ float shVals[kNumShmemElements];
-
-  int index = threadIdx.x + blockDim.x * blockIdx.x;
-
-  int planeIndex = index >> 6;
-
-  if (planeIndex >= n) return;
-
-  // Load inputs to shared memory.
-  if (threadIdx.x < kNumShmemElements) {
-    shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x];
-    shVals[threadIdx.x] = values[planeIndex + threadIdx.x];
-  }
-  __syncthreads();
-
-  uint64_t mask = shMasks[threadIdx.x >> 6];
-
-  int sqIndex = index & 0x3F;
-  float op = 0;
-
-  bool set = !!(mask & (1ull << sqIndex));
-  if (set) {
-    op = shVals[threadIdx.x >> 6];
-  }
-  output[index] = op;
-}
-
-void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks,
-                            const float* values, int n, cudaStream_t stream) {
-  int threads = n * 8 * 8;  // Each thread writes a single element.
-  const int blockSize = 256;
-  int blocks = DivUp(threads, blockSize);
-  expandPlanes_kernel_Fp32_NCHW<<<blocks, blockSize, 0, stream>>>(output, masks,
-                                                                  values, n);
-  ReportCUDAErrors(cudaGetLastError());
-}
-
-// TODO: Can optimize using shared memory if this becomes a bottleneck.
-__global__ void expandPlanes_kernel_Fp16_NHWC(half* output,
-                                              const uint64_t* masks,
-                                              const float* values, int n) {
+template <typename T>
+__global__ void expandPlanes_kernel_NHWC(T* output, const uint64_t* masks,
+                                         const T* values, int n) {
   const int index = threadIdx.x + blockDim.x * blockIdx.x;
   if (index >= n * 8 * 8) return;
 
@@ -505,66 +462,61 @@ __global__ void expandPlanes_kernel_Fp16_NHWC(half* output,
 
   uint64_t mask = masks[boardIndex * kInputPlanes + planeIndex];
 
-  half op = 0;
+  T op = 0;
   bool set = !!(mask & (1ull << sqIndex));
   if (set) {
-    float val = values[boardIndex * kInputPlanes + planeIndex];
-    op = (half)val;
+    op = values[boardIndex * kInputPlanes + planeIndex];
   }
   output[index] = op;
 }
 
-void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks,
-                            const float* values, int n, cudaStream_t stream) {
+template <typename T>
+void expandPlanes_NHWC(T* output, const uint64_t* masks, const T* values, int n,
+                       cudaStream_t stream) {
   int threads = n * 8 * 8;  // Each thread writes a single element.
   const int kBlockSize = 256;
   int blocks = DivUp(threads, kBlockSize);
-  expandPlanes_kernel_Fp16_NHWC<<<blocks, kBlockSize, 0, stream>>>(
-      output, masks, values, n);
+  expandPlanes_kernel_NHWC<<<blocks, kBlockSize, 0, stream>>>(output, masks,
+                                                              values, n);
   ReportCUDAErrors(cudaGetLastError());
 }
 
-__global__ void expandPlanes_kernel_Fp16_NCHW(half* output,
-                                              const uint64_t* masks,
-                                              const float* values, int n) {
-  // block size of 256, same mask/val for 64 consecutive threads
-  constexpr int kNumShmemElements = 256 / 64;
-
-  __shared__ uint64_t shMasks[kNumShmemElements];
-  __shared__ half shVals[kNumShmemElements];
-
-  int index = threadIdx.x + blockDim.x * blockIdx.x;
+template <typename T>
+__global__ void expandPlanes_kernel_NCHW(T* output, const uint64_t* masks,
+                                         const T* values, unsigned n) {
+  unsigned index = threadIdx.x + blockDim.x * blockIdx.x;
 
-  int planeIndex = index >> 6;
+  index *= 2;
+  unsigned planeIndex = index >> 6;
 
   if (planeIndex >= n) return;
 
-  // load inputs to shared memory
-  if (threadIdx.x < kNumShmemElements) {
-    shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x];
-    shVals[threadIdx.x] = values[planeIndex + threadIdx.x];
-  }
-  __syncthreads();
-
-  uint64_t mask = shMasks[threadIdx.x >> 6];
+  uint64_t mask = masks[planeIndex];
 
   int sqIndex = index & 0x3F;
-  half op = 0;
+  T op[2] = {0, 0};
 
   bool set = !!(mask & (1ull << sqIndex));
   if (set) {
-    op = (half)shVals[threadIdx.x >> 6];
+    op[0] = values[planeIndex];
   }
-  output[index] = op;
+  sqIndex++;
+  set = !!(mask & (1ull << sqIndex));
+  if (set) {
+    op[1] = values[planeIndex];
+  }
+  output[index + 0] = op[0];
+  output[index + 1] = op[1];
 }
 
-void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks,
-                            const float* values, int n, cudaStream_t stream) {
-  int threads = n * 8 * 8;  // each thread writes a single element
+template <typename T>
+void expandPlanes_NCHW(T* output, const uint64_t* masks, const T* values, int n,
+                       cudaStream_t stream) {
+  unsigned threads = n * 8 * 8 / 2;  // Each thread writes two elements.
   const int blockSize = 256;
-  int blocks = DivUp(threads, blockSize);
-  expandPlanes_kernel_Fp16_NCHW<<<blocks, blockSize, 0, stream>>>(output, masks,
-                                                                  values, n);
+  unsigned blocks = DivUp(threads, blockSize);
+  expandPlanes_kernel_NCHW<<<blocks, blockSize, 0, stream>>>(output, masks,
+                                                             values, n);
   ReportCUDAErrors(cudaGetLastError());
 }
 
@@ -704,14 +656,14 @@ __global__ void globalAvgPool_kernel(T* output, const T* input,
 
 template <typename T>
 void globalAvgPool(int N, int C, T* output, const T* input,
-                   const T* prevLayerBias, bool nhwc) {
+                   const T* prevLayerBias, bool nhwc, cudaStream_t stream) {
   const int kPlaneSize = 64;
   if (nhwc) {
     assert((std::is_same<half, T>::value));
     // For NHWC fp16, simply launch N blocks, each with C threads.
-    globalAvgPool_kernel_NHWC_fp16<<<N, C>>>((half*)output, (half*)input,
-                                             (half*)prevLayerBias,
-                                             N * C * kPlaneSize, N * C);
+    globalAvgPool_kernel_NHWC_fp16<<<N, C, 0, stream>>>(
+        (half*)output, (half*)input, (half*)prevLayerBias, N * C * kPlaneSize,
+        N * C);
   } else {
     // For NCHW layout (used with fp32),
     // each warp processes a full plane (64 elements), and writes a single
@@ -722,8 +674,8 @@ void globalAvgPool(int N, int C, T* output, const T* input,
     const int kBlockSize = kWarpsPerBlock * 32;
 
     int blocks = DivUp(kTotalWarps, kWarpsPerBlock);
-    globalAvgPool_kernel<<<blocks, kBlockSize>>>(output, input, prevLayerBias,
-                                                 N * C * kPlaneSize, N * C, C);
+    globalAvgPool_kernel<<<blocks, kBlockSize, 0, stream>>>(
+        output, input, prevLayerBias, N * C * kPlaneSize, N * C, C);
   }
   ReportCUDAErrors(cudaGetLastError());
 }
@@ -731,18 +683,18 @@ void globalAvgPool(int N, int C, T* output, const T* input,
 template <typename T>
 void globalScale(int N, int C, T* output, const T* input, const T* scaleBias,
                  const T* prevLayerBias, bool nhwc,
-                 ActivationFunction activation) {
+                 ActivationFunction activation, cudaStream_t stream) {
   // Each thread writes one output.
   const int kBlockSize = 256;
   const int kBlocks = DivUp(N * 8 * 8 * C, kBlockSize);
 
   if (nhwc) {
     assert((std::is_same<half, T>::value));
-    globalScale_kernel_fp16_nhwc<<<kBlocks, kBlockSize>>>(
+    globalScale_kernel_fp16_nhwc<<<kBlocks, kBlockSize, 0, stream>>>(
         (half*)output, (half*)input, (half*)scaleBias, (half*)prevLayerBias,
         N * C * 8 * 8, C, 8 * 8 * C, activation);
   } else {
-    globalScale_kernel<<<kBlocks, kBlockSize>>>(
+    globalScale_kernel<<<kBlocks, kBlockSize, 0, stream>>>(
         output, input, scaleBias, prevLayerBias, N * C * 8 * 8, C, activation);
   }
   ReportCUDAErrors(cudaGetLastError());
@@ -808,6 +760,15 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
   ReportCUDAErrors(cudaGetLastError());
 }
 
+// Limits val to [low, high]; a NaN input is handed back unchanged.
+__device__ __forceinline__ float clamp(float val, float low, float high) {
+  return __builtin_expect(isnan(val), 0) ? val : fminf(fmaxf(val, low), high);
+}
+
+namespace {
+constexpr float kTwiceHalfMax = 131008.0f;  // Twice the max finite fp16 value.
+}  // namespace
+
 // softmax along C dimension which is assumed to be 64
 // each thread processes two elements. Each warp computes a sum (over 64
 // elements)
@@ -843,6 +804,11 @@ __global__ void softmax_opt_64_kernel(T* output, const T* input,
     x[0] += x[2];
     x[1] += x[3];
   }
+  if (fp16) {
+    // Guard against Inf from fp16 overflow.
+    x[0] = clamp(x[0], -kTwiceHalfMax, kTwiceHalfMax);
+    x[1] = clamp(x[1], -kTwiceHalfMax, kTwiceHalfMax);
+  }
   float threadMax = max(x[0], x[1]);
   float maxval = warpMax(threadMax);
   maxval = __shfl_sync(0xFFFFFFFF, maxval, 0);
@@ -884,6 +850,10 @@ __global__ void softmax_kernel(T* output, const T* input, const T* input2) {
 
   float x = (float)input[index];
   if (input2 != nullptr) x += (float)input2[index];
+  if (std::is_same<half, T>::value) {
+    // Guard against Inf from fp16 overflow.
+    x = clamp(x, -kTwiceHalfMax, kTwiceHalfMax);
+  }
 
   __shared__ float sum, maxval;
   if (c == 0) {
@@ -1242,7 +1212,8 @@ __global__ void preprocess_for_attention_body_kernel(
   if (c >= input_size) {
     // concatenate from position encoding array
     if (is_pe_dense_embedding) {
-      op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + (c - input_size)]);
+      op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size +
+                        (c - input_size)]);
     } else {
       op = (T)(encoding[64 * hw + (c - input_size)]);
     }
@@ -1309,6 +1280,64 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add,
   ReportCUDAErrors(cudaGetLastError());
 }
 
+// Fills `offsets` with five consecutive tables of `block_size` pointers, in
+// the order k, q, b1, v, b2. Entry e of the k/q/v/b2 tables addresses head
+// (e % heads) of batch item (e / heads); entry e of the b1 table addresses the
+// e-th 64x64 block. A thread writes kWorkPerThread adjacent entries per table,
+// deriving entry first + w by stepping w heads onwards within the same item.
+template <typename T, int kWorkPerThread>
+__global__ void genOffsetPointers_kernel(T** offsets, int heads, int block_size,
+                                         int depth, int d_model, T* k, T* q,
+                                         T* b1, T* v, T* b2) {
+  const int first = (blockIdx.x * blockDim.x + threadIdx.x) * kWorkPerThread;
+  if (first >= block_size) return;
+  const int head = first % heads;
+  const int item = first / heads;
+  // Address of this thread's w-th entry inside a per-head tensor.
+  const auto head_ptr = [=](T* base, int w) {
+    return base + head * depth + 64 * d_model * item + w * depth;
+  };
+  T** const slots = offsets + first;
+
+  for (int w = 0; w < kWorkPerThread; ++w) {
+    slots[w] = head_ptr(k, w);
+  }
+  for (int w = 0; w < kWorkPerThread; ++w) {
+    slots[w + block_size] = head_ptr(q, w);
+  }
+  for (int w = 0; w < kWorkPerThread; ++w) {
+    slots[w + 2 * block_size] = b1 + first * 64 * 64 + w * 64 * 64;
+  }
+  for (int w = 0; w < kWorkPerThread; ++w) {
+    slots[w + 3 * block_size] = head_ptr(v, w);
+  }
+  for (int w = 0; w < kWorkPerThread; ++w) {
+    slots[w + 4 * block_size] = head_ptr(b2, w);
+  }
+}
+
+template <typename T>
+void genOffsetPointers(T** offsets, int heads, int max_batch, int depth,
+                       int d_model, T* k, T* q, T* b1, T* v, T* b2,
+                       cudaStream_t stream) {
+  const int block_size = heads * max_batch;
+  constexpr int kWorkGroupSize = 128;
+  // Two entries per thread allow 128 bit stores, but the kernel derives the
+  // second by stepping one head forward, so the head count must be even.
+  constexpr int kWorkPerThread = 2;
+  if (heads % kWorkPerThread != 0) {
+    int grid = DivUp(block_size, kWorkGroupSize);
+    genOffsetPointers_kernel<T, 1><<<grid, kWorkGroupSize, 0, stream>>>(
+        offsets, heads, block_size, depth, d_model, k, q, b1, v, b2);
+  } else {
+    int grid = DivUp(block_size, kWorkGroupSize * kWorkPerThread);
+    genOffsetPointers_kernel<T, kWorkPerThread>
+        <<<grid, kWorkGroupSize, 0, stream>>>(offsets, heads, block_size, depth,
+                                              d_model, k, q, b1, v, b2);
+  }
+  ReportCUDAErrors(cudaGetLastError());
+}
+
 // Template instantiation.
 template void copyTypeConverted<half, float>(half* op, float* ip, int N,
                                              cudaStream_t stream);
@@ -1322,11 +1351,13 @@ template void copyTypeConverted<half, half>(half* op, half* ip, int N,
 template void batchNorm<float>(float* output, const float* input,
                                const float* skipInput, int N, int C, int H,
                                int W, float* means, float* var_multipliers,
-                               ActivationFunction activation);
+                               ActivationFunction activation,
+                               cudaStream_t stream);
 template void batchNorm<half>(half* output, const half* input,
                               const half* skipInput, int N, int C, int H, int W,
                               float* means, float* var_multipliers,
-                              ActivationFunction activation);
+                              ActivationFunction activation,
+                              cudaStream_t stream);
 
 template void addVectors<float>(float* c, float* a, float* b, int size,
                                 int asize, int bsize, ActivationFunction act,
@@ -1368,18 +1399,36 @@ template void addBias_NCHW<half>(half* c, half* a, half* b, int N, int C, int H,
 
 template void globalAvgPool<float>(int N, int C, float* output,
                                    const float* input,
-                                   const float* prevLayerBias, bool nhwc);
+                                   const float* prevLayerBias, bool nhwc,
+                                   cudaStream_t stream);
 template void globalAvgPool<half>(int N, int C, half* output, const half* input,
-                                  const half* prevLayerBias, bool nhwc);
+                                  const half* prevLayerBias, bool nhwc,
+                                  cudaStream_t stream);
+
+template void expandPlanes_NHWC<float>(float* output, const uint64_t* masks,
+                                       const float* values, int n,
+                                       cudaStream_t stream);
+template void expandPlanes_NHWC<half>(half* output, const uint64_t* masks,
+                                      const half* values, int n,
+                                      cudaStream_t stream);
+
+template void expandPlanes_NCHW<float>(float* output, const uint64_t* masks,
+                                       const float* values, int n,
+                                       cudaStream_t stream);
+template void expandPlanes_NCHW<half>(half* output, const uint64_t* masks,
+                                      const half* values, int n,
+                                      cudaStream_t stream);
 
 template void globalScale<float>(int N, int C, float* output,
                                  const float* input, const float* scaleBias,
                                  const float* prevLayerBias, bool nhwc,
-                                 ActivationFunction activation);
+                                 ActivationFunction activation,
+                                 cudaStream_t stream);
 template void globalScale<half>(int N, int C, half* output, const half* input,
                                 const half* scaleBias,
                                 const half* prevLayerBias, bool nhwc,
-                                ActivationFunction activation);
+                                ActivationFunction activation,
+                                cudaStream_t stream);
 
 template void PolicyMap<float>(int N, float* output, const float* input,
                                const short* indices, int inputSize,
@@ -1391,7 +1440,7 @@ template void PolicyMap<half>(int N, half* output, const half* input,
                               int outputSize, cudaStream_t stream);
 
 template void FilterTransform<float>(int N, int C, float* transformedFilter,
-                                     const float* filter);
+                                     const float* filter, cudaStream_t stream);
 
 template void InputTransform<float, true>(int N, int C,
                                           float* transformed_input,
@@ -1566,15 +1615,16 @@ template void ComputePromotionLogits<float>(int N, int C, float* output,
 template void convertNCHWtoNHWC<half, float>(half* output_tensor,
                                              const float* input_tensor, int Nin,
                                              int Cin, int Nout, int Cout, int H,
-                                             int W);
+                                             int W, cudaStream_t stream);
 template void convertNCHWtoNHWC<float, float>(float* output_tensor,
                                               const float* input_tensor,
                                               int Nin, int Cin, int Nout,
-                                              int Cout, int H, int W);
+                                              int Cout, int H, int W,
+                                              cudaStream_t stream);
 template void convertNCHWtoNHWC<half, half>(half* output_tensor,
                                             const half* input_tensor, int Nin,
                                             int Cin, int Nout, int Cout, int H,
-                                            int W);
+                                            int W, cudaStream_t stream);
 
 template void inputPreprocessForAttentionBody<half>(
     half* output, const half* input, const half* encoding, int N,
@@ -1595,5 +1645,14 @@ template void applyInputGating<float>(float* output, const float* input,
                                       const float* mult, const float* add,
                                       int N, int C, int output_size,
                                       cudaStream_t stream);
+
+template void genOffsetPointers<float>(float** offsets, int heads,
+                                       int max_batch, int depth, int d_model,
+                                       float* k, float* q, float* b1, float* v,
+                                       float* b2, cudaStream_t stream);
+template void genOffsetPointers<half>(half** offsets, int heads, int max_batch,
+                                      int depth, int d_model, half* k, half* q,
+                                      half* b1, half* v, half* b2,
+                                      cudaStream_t stream);
 }  // namespace cudnn_backend
 }  // namespace lczero
diff --git a/src/neural/cuda/cuda_common.h b/src/neural/backends/cuda/cuda_common.h
similarity index 98%
rename from src/neural/cuda/cuda_common.h
rename to src/neural/backends/cuda/cuda_common.h
index ca91f0e91b..1babb7e003 100644
--- a/src/neural/cuda/cuda_common.h
+++ b/src/neural/backends/cuda/cuda_common.h
@@ -30,7 +30,7 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 
-#include "utils/exception.h"
+#include "utils/fp16_utils.h"
 
 #ifdef USE_CUDNN
 #include <cudnn.h>
diff --git a/src/neural/backends/cuda/cutlass_kernels.cu b/src/neural/backends/cuda/cutlass_kernels.cu
new file mode 100644
index 0000000000..619c839f90
--- /dev/null
+++ b/src/neural/backends/cuda/cutlass_kernels.cu
@@ -0,0 +1,124 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/backends/cuda/cuda_common.h"
+
+// Fused MHA implementation from cutlass example #41
+#include "fused_multi_head_attention/kernel_forward.h"
+#include "utils/exception.h"
+
+namespace lczero {
+namespace cudnn_backend {
+// Runs the cutlass fused multi-head attention forward kernel on `stream` over
+// fp16 q/k/v with 64 queries and 64 keys per batch item; `skip` is passed as
+// the attention bias. Throws Exception if cutlass rejects the parameters.
+template <bool bias>
+void fusedMHACutlass(void* output, void* q, void* k, void* v, void* skip,
+                     int batch_size, int num_heads, int depth,
+                     cudaStream_t stream) {
+  cutlass::half_t* mha_q = (cutlass::half_t*)q;
+  cutlass::half_t* mha_k = (cutlass::half_t*)k;
+  cutlass::half_t* mha_v = (cutlass::half_t*)v;
+
+  constexpr int kQueriesPerBlock = 64;
+  constexpr int kKeysPerBlock = 64;
+  constexpr bool kSingleValueIteration = true;
+  // NOTE(review): ArchTag is hard-coded to Sm80; confirm callers gate on it.
+  using Attention =
+      AttentionKernel<cutlass::half_t,      // scalar_t
+                      cutlass::arch::Sm80,  // ArchTag
+                      true,                 // Memory is aligned
+                      kQueriesPerBlock, kKeysPerBlock, kSingleValueIteration,
+                      false,  // Supports dropout
+                      bias    // Supports bias
+                      >;
+  static_assert(
+      !Attention::kNeedsOutputAccumulatorBuffer,
+      "Unhandled case in cutlass MHA: needs output accumulator buffer");
+
+  typename Attention::Params p;
+  {  // set parameters
+    p.query_ptr = mha_q;
+    p.key_ptr = mha_k;
+    p.value_ptr = mha_v;
+    p.logsumexp_ptr = nullptr;  // Only needed for the backward pass
+    p.output_accum_ptr = nullptr;
+    p.output_ptr = (cutlass::half_t*)output;
+    p.attn_bias_ptr = (cutlass::half_t*)skip;
+    p.scale = 1.0f / sqrt((float)depth);
+    p.num_heads = num_heads;
+    p.num_batches = batch_size;
+    p.head_dim = depth;
+    p.head_dim_value = depth;
+    p.num_queries = 64;
+    p.num_keys = 64;
+
+    // All tensors are in BMHK shapes
+    p.q_strideH = depth;
+    p.k_strideH = depth;
+    p.v_strideH = depth;
+    p.q_strideM = depth * num_heads;
+    p.k_strideM = depth * num_heads;
+    p.v_strideM = depth * num_heads;
+    p.q_strideB = p.q_strideM * 64;
+    p.k_strideB = p.k_strideM * 64;
+    p.v_strideB = p.v_strideM * 64;
+    p.o_strideM = p.head_dim_value * p.num_heads;
+    // Bias layout: a 64x64 matrix per head, num_heads of them per batch item.
+    p.bias_strideH = 64 * 64;
+    p.bias_strideM = 64;
+    p.bias_strideB = num_heads * p.bias_strideH;
+  }
+
+  constexpr auto kernel_fn = attention_kernel_batched_impl<Attention>;
+  int smem_bytes = sizeof(typename Attention::SharedStorage);
+  // 0xc000 bytes = 48 KiB; dynamic shared memory above that needs an opt-in.
+  if (smem_bytes > 0xc000) {
+    ReportCUDAErrors(cudaFuncSetAttribute(
+        kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
+  }
+  if (!Attention::check_supported(p)) {
+    throw Exception("Unhandled case in cutlass MHA: check_supported failed.");
+  }
+
+  kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
+  ReportCUDAErrors(cudaGetLastError());
+}
+
+// Runs the cutlass fused multi-head attention kernel. `skip` is the optional
+// attention bias: the bias-enabled instantiation is used only when it is set.
+void fusedMHA(void* output, void* mha_q, void* mha_k, void* mha_v, void* skip,
+              int batch_size, int num_heads, int depth, cudaStream_t stream) {
+  // `bias` is a compile-time switch, so choose the instantiation here.
+  const auto launch =
+      (skip != nullptr) ? &fusedMHACutlass<true> : &fusedMHACutlass<false>;
+  launch(output, mha_q, mha_k, mha_v, skip, batch_size, num_heads, depth,
+         stream);
+}
+
+}  // namespace cudnn_backend
+}  // namespace lczero
diff --git a/src/neural/cuda/fp16_kernels.cu b/src/neural/backends/cuda/fp16_kernels.cu
similarity index 91%
rename from src/neural/cuda/fp16_kernels.cu
rename to src/neural/backends/cuda/fp16_kernels.cu
index f30b14f50d..37827ba0eb 100644
--- a/src/neural/cuda/fp16_kernels.cu
+++ b/src/neural/backends/cuda/fp16_kernels.cu
@@ -26,7 +26,8 @@
 */
 
 #include "cuda_common.h"
-#include "neural/shared/activation.h"
+#include "neural/tables/activation_function.h"
+#include "utils/exception.h"
 
 // Allow building on an old architecture.
 #if __CUDA_ARCH__ < 530
@@ -137,61 +138,61 @@ __global__ void SE_Layer_NHWC(half* output, const half* skip, const half* input,
 bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip,
                   const half* input, const half* w1, const half* b1,
                   const half* w2, const half* b2, const half* bPrev,
-                  ActivationFunction activation) {
+                  ActivationFunction activation, cudaStream_t stream) {
   // TODO: Think of more elegant way to avoid this hardcoding :-/
   if (numFc1Out == 16) {
     if (C == 64) {
-      SE_Layer_NHWC<64, 16>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<64, 16><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                 w2, b2, bPrev, activation);
     } else {
       // TODO: support other channel counts.
       throw Exception("channel count unsupported by SE layer");
     }
   } else if (numFc1Out == 32) {
     if (C == 64) {
-      SE_Layer_NHWC<64, 32>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<64, 32><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                 w2, b2, bPrev, activation);
     } else if (C == 128) {
-      SE_Layer_NHWC<128, 32>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<128, 32><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 192) {
-      SE_Layer_NHWC<192, 32>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<192, 32><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 256) {
-      SE_Layer_NHWC<256, 32>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<256, 32><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 320) {
-      SE_Layer_NHWC<320, 32>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<320, 32><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 352) {
-      SE_Layer_NHWC<352, 32>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<352, 32><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 384) {
-      SE_Layer_NHWC<384, 32>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<384, 32><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else {
       // TODO: support other channel counts.
       return false;
     }
   } else if (numFc1Out == 64) {
     if (C == 64) {
-      SE_Layer_NHWC<64, 64>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<64, 64><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                 w2, b2, bPrev, activation);
     } else if (C == 128) {
-      SE_Layer_NHWC<128, 64>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<128, 64><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 192) {
-      SE_Layer_NHWC<192, 64>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<192, 64><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 256) {
-      SE_Layer_NHWC<256, 64>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<256, 64><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 320) {
-      SE_Layer_NHWC<320, 64>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<320, 64><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else if (C == 384) {
-      SE_Layer_NHWC<384, 64>
-          <<<N, C>>>(output, skip, input, w1, b1, w2, b2, bPrev, activation);
+      SE_Layer_NHWC<384, 64><<<N, C, 0, stream>>>(output, skip, input, w1, b1,
+                                                  w2, b2, bPrev, activation);
     } else {
       // TODO: support other channel counts.
       return false;
@@ -474,7 +475,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
 }
 
 template void FilterTransform<half>(int N, int C, half* transformedFilter,
-                                    const half* filter);
+                                    const half* filter, cudaStream_t stream);
 
 template void InputTransform<half, true>(int N, int C, half* transformed_input,
                                          const half* input,
diff --git a/src/neural/backends/cuda/inputs_outputs.h b/src/neural/backends/cuda/inputs_outputs.h
new file mode 100644
index 0000000000..89e728da84
--- /dev/null
+++ b/src/neural/backends/cuda/inputs_outputs.h
@@ -0,0 +1,300 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+
+#include "cuda_common.h"
+#include "neural/network.h"
+#include "utils/bit.h"
+
+namespace lczero {
+namespace cudnn_backend {
+
+// Stores a float into `dst`, converting to fp16 bits when `dst` is a half.
+inline void ToType(float& dst, float src) { dst = src; }
+inline void ToType(half& dst, float src) {
+  dst = bit_cast<half>(FP32toFP16(src));
+}
+
+// Reads `src` back as a float, converting from fp16 bits when it is a half.
+inline float FromType(float src) { return src; }
+inline float FromType(half src) {
+  return FP16toFP32(bit_cast<uint16_t>(src));
+}
+
+template <typename DataType>
+struct CudaGraphCapture;
+
+// Holds a cudaGraphExec_t and destroys it (if set) in the destructor.
+template <typename DataType>
+struct CudaGraphExec {
+  ~CudaGraphExec() {
+    if (graph_exec_) ReportCUDAErrors(cudaGraphExecDestroy(graph_exec_));
+  }
+
+  // Instantiates graph_exec_ from the graph held by a CudaGraphCapture.
+  CudaGraphExec& operator=(const CudaGraphCapture<DataType>&);
+  explicit operator bool() const { return graph_exec_ != nullptr; }
+
+  void Launch(cudaStream_t stream) {
+    ReportCUDAErrors(cudaGraphLaunch(graph_exec_, stream));
+  }
+  cudaGraphExec_t graph_exec_ = nullptr;
+};
+
+// Host and device I/O buffers plus the CUDA events/streams used with them.
+// With a non-zero tensor_mem_size it also owns the memory to run the network.
+template <typename DataType>
+struct InputsOutputs {
+  InputsOutputs(unsigned maxBatchSize, bool wdl, bool moves_left,
+                size_t tensor_mem_size = 0, size_t scratch_size = 0,
+                bool cublasDisableTensorCores = false) {
+    ReportCUDAErrors(cudaHostAlloc(
+        &input_masks_mem_, maxBatchSize * kInputPlanes * sizeof(uint64_t),
+        cudaHostAllocMapped));
+    ReportCUDAErrors(cudaMalloc(
+        &input_masks_mem_gpu_, maxBatchSize * kInputPlanes * sizeof(uint64_t)));
+
+    ReportCUDAErrors(
+        cudaHostAlloc(&input_val_mem_,
+                      maxBatchSize * kInputPlanes * sizeof(input_val_mem_[0]),
+                      cudaHostAllocMapped));
+    ReportCUDAErrors(cudaMalloc(
+        &input_val_mem_gpu_,
+        maxBatchSize * kInputPlanes * sizeof(input_val_mem_gpu_[0])));
+
+    ReportCUDAErrors(cudaHostAlloc(
+        &op_policy_mem_,
+        maxBatchSize * kNumOutputPolicy * sizeof(op_policy_mem_[0]), 0));
+
+    // Separate device memory copy for policy output.
+    // It's faster to write to device memory and then copy to host memory
+    // than having the kernel write directly to it.
+    ReportCUDAErrors(cudaMalloc(
+        &op_policy_mem_gpu_,
+        maxBatchSize * kNumOutputPolicy * sizeof(op_policy_mem_[0])));
+    ReportCUDAErrors(cudaHostAlloc(
+        &op_value_mem_, maxBatchSize * (wdl ? 3 : 1) * sizeof(op_value_mem_[0]),
+        cudaHostAllocMapped));
+    ReportCUDAErrors(cudaMalloc(
+        &op_value_mem_gpu_,
+        maxBatchSize * (wdl ? 3 : 1) * sizeof(op_value_mem_gpu_[0])));
+    if (wdl && sizeof(DataType) != sizeof(float)) {
+      wdl_cpu_softmax_ = std::make_unique<float[]>(maxBatchSize * 2);
+    }
+    ReportCUDAErrors(
+        cudaEventCreateWithFlags(&upload_done_event_, cudaEventDisableTiming));
+    ReportCUDAErrors(
+        cudaEventCreateWithFlags(&policy_done_event_, cudaEventDisableTiming));
+    ReportCUDAErrors(
+        cudaEventCreateWithFlags(&value_done_event_, cudaEventDisableTiming));
+    ReportCUDAErrors(cudaEventCreateWithFlags(&wdl_download_done_event_,
+                                              cudaEventDisableTiming));
+    ReportCUDAErrors(cudaEventCreateWithFlags(&download_done_event_,
+                                              cudaEventDisableTiming));
+    if (moves_left) {
+      ReportCUDAErrors(cudaHostAlloc(
+          &op_moves_left_mem_, maxBatchSize * sizeof(op_moves_left_mem_[0]),
+          cudaHostAllocMapped));
+      ReportCUDAErrors(
+          cudaMalloc(&op_moves_left_mem_gpu_,
+                     maxBatchSize * sizeof(op_moves_left_mem_gpu_[0])));
+      ReportCUDAErrors(cudaEventCreateWithFlags(&moves_left_done_event_,
+                                                cudaEventDisableTiming));
+    }
+
+    ReportCUDAErrors(
+        cudaStreamCreateWithFlags(&exec_stream_, cudaStreamNonBlocking));
+    ReportCUDAErrors(
+        cudaEventCreateWithFlags(&join_capture_event_, cudaEventDisableTiming));
+    cuda_graphs_ = std::make_unique<CudaGraphExec<DataType>[]>(maxBatchSize);
+
+    // memory for network execution managed inside this structure
+    if (tensor_mem_size) {
+      multi_stream_ = true;
+      ReportCUDAErrors(
+          cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking));
+      ReportCUDAErrors(
+          cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking));
+      ReportCUDAErrors(
+          cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking));
+      ReportCUDAErrors(cudaMalloc(&scratch_mem_, scratch_size));
+      for (auto& mem : tensor_mem_) {
+        ReportCUDAErrors(cudaMalloc(&mem, tensor_mem_size));
+        ReportCUDAErrors(
+            cudaMemsetAsync(mem, 0, tensor_mem_size, compute_stream_));
+      }
+      ReportCUBLASErrors(cublasCreate(&cublas_));
+      ReportCUBLASErrors(cublasSetMathMode(
+          cublas_, cublasDisableTensorCores ? CUBLAS_PEDANTIC_MATH
+                                            : CUBLAS_TENSOR_OP_MATH));
+      ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_));
+    }
+  }
+  ~InputsOutputs() {
+    ReportCUDAErrors(cudaFreeHost(input_masks_mem_));
+    ReportCUDAErrors(cudaFree(input_masks_mem_gpu_));
+    ReportCUDAErrors(cudaFreeHost(input_val_mem_));
+    ReportCUDAErrors(cudaFree(input_val_mem_gpu_));
+    ReportCUDAErrors(cudaFreeHost(op_policy_mem_));
+    ReportCUDAErrors(cudaFree(op_policy_mem_gpu_));
+    ReportCUDAErrors(cudaFreeHost(op_value_mem_));
+    ReportCUDAErrors(cudaFree(op_value_mem_gpu_));
+    ReportCUDAErrors(cudaEventDestroy(upload_done_event_));
+    ReportCUDAErrors(cudaEventDestroy(policy_done_event_));
+    ReportCUDAErrors(cudaEventDestroy(value_done_event_));
+    ReportCUDAErrors(cudaEventDestroy(wdl_download_done_event_));
+    ReportCUDAErrors(cudaEventDestroy(download_done_event_));
+    if (op_moves_left_mem_ != nullptr) {
+      ReportCUDAErrors(cudaFreeHost(op_moves_left_mem_));
+      ReportCUDAErrors(cudaFree(op_moves_left_mem_gpu_));
+      ReportCUDAErrors(cudaEventDestroy(moves_left_done_event_));
+    }
+    ReportCUDAErrors(cudaEventDestroy(join_capture_event_));
+    ReportCUDAErrors(cudaStreamDestroy(exec_stream_));
+
+    if (multi_stream_) {
+      for (auto mem : tensor_mem_) {
+        if (mem) ReportCUDAErrors(cudaFree(mem));
+      }
+      if (scratch_mem_) ReportCUDAErrors(cudaFree(scratch_mem_));
+      if (offset_pointers_) ReportCUDAErrors(cudaFree(offset_pointers_));
+      if (head_offset_pointers_) {
+        ReportCUDAErrors(cudaFree(head_offset_pointers_));
+      }
+      ReportCUDAErrors(cudaStreamDestroy(compute_stream_));
+      ReportCUDAErrors(cudaStreamDestroy(upload_stream_));
+      ReportCUDAErrors(cudaStreamDestroy(download_stream_));
+      ReportCUBLASErrors(cublasDestroy(cublas_));
+    }
+  }
+  uint64_t* input_masks_mem_ = nullptr;
+  DataType* input_val_mem_ = nullptr;
+  DataType* op_policy_mem_ = nullptr;
+  DataType* op_value_mem_ = nullptr;
+  DataType* op_moves_left_mem_ = nullptr;
+
+  // Copies in VRAM.
+  uint64_t* input_masks_mem_gpu_ = nullptr;
+  DataType* input_val_mem_gpu_ = nullptr;
+  DataType* op_policy_mem_gpu_ = nullptr;
+  DataType* op_value_mem_gpu_ = nullptr;
+  DataType* op_moves_left_mem_gpu_ = nullptr;
+
+  std::unique_ptr<float[]> wdl_cpu_softmax_;
+
+  // memory needed to run the network owned by InputsOutputs when multi_stream
+  // is enabled
+  bool multi_stream_ = false;
+  void* tensor_mem_[3] = {};
+  void* scratch_mem_ = nullptr;
+  void** offset_pointers_ = nullptr;
+  void** head_offset_pointers_ = nullptr;
+
+  // cuda stream used to run the network
+  cudaStream_t compute_stream_ = nullptr;
+  cudaStream_t upload_stream_ = nullptr;
+  cudaStream_t download_stream_ = nullptr;
+
+  // cuda events to synchronize between streams
+  cudaEvent_t upload_done_event_ = nullptr;
+  cudaEvent_t policy_done_event_ = nullptr;
+  cudaEvent_t value_done_event_ = nullptr;
+  cudaEvent_t moves_left_done_event_ = nullptr;
+  cudaEvent_t wdl_download_done_event_ = nullptr;
+  cudaEvent_t download_done_event_ = nullptr;
+
+  // cuda graph support
+  cudaStream_t exec_stream_ = nullptr;
+  std::unique_ptr<CudaGraphExec<DataType>[]> cuda_graphs_;
+  cudaEvent_t join_capture_event_ = nullptr;
+
+  // cublas handle used to run the network
+  cublasHandle_t cublas_ = nullptr;
+};
+
+// Captures a CUDA graph: the constructor begins capture on upload_stream and
+// EndCapture() joins download_stream back before ending the capture.
+template <typename DataType>
+struct CudaGraphCapture {
+  static constexpr int kMinimumFreeMemory = 100 * 1024 * 1024;
+
+  CudaGraphCapture(InputsOutputs<DataType>& io, cudaStream_t upload_stream,
+                   cudaStream_t download_stream)
+      : io_(io),
+        upload_stream_(upload_stream),
+        download_stream_(download_stream) {
+    ReportCUDAErrors(cudaStreamBeginCapture(upload_stream_,
+                                            cudaStreamCaptureModeThreadLocal));
+  }
+
+  ~CudaGraphCapture() {
+    if (graph_) ReportCUDAErrors(cudaGraphDestroy(graph_));
+  }
+
+  static bool EnsureEnoughFreeMemory() {
+    size_t available = 0;
+    size_t total = 0;
+    ReportCUDAErrors(cudaMemGetInfo(&available, &total));
+    return available > kMinimumFreeMemory;
+  }
+
+  void EndCapture() {
+    ReportCUDAErrors(
+        cudaEventRecord(io_.join_capture_event_, download_stream_));
+    ReportCUDAErrors(
+        cudaStreamWaitEvent(upload_stream_, io_.join_capture_event_, 0));
+    ReportCUDAErrors(cudaStreamEndCapture(upload_stream_, &graph_));
+  }
+
+  InputsOutputs<DataType>& io_;
+  cudaStream_t upload_stream_;
+  cudaStream_t download_stream_;
+
+  cudaGraph_t graph_ = nullptr;
+};
+
+// Instantiates the captured graph; on CUDA >= 11.1 also uploads it.
+template <typename DataType>
+inline CudaGraphExec<DataType>& CudaGraphExec<DataType>::operator=(
+    const CudaGraphCapture<DataType>& capture) {
+  assert(graph_exec_ == nullptr);
+  if (capture.graph_ == nullptr)
+    throw Exception("Trying to instantiate an nullptr cuda graph");
+  ReportCUDAErrors(
+      cudaGraphInstantiate(&graph_exec_, capture.graph_, nullptr, nullptr, 0));
+#if CUDART_VERSION >= 11010
+  ReportCUDAErrors(cudaGraphUpload(graph_exec_, capture.io_.exec_stream_));
+#endif
+  return *this;
+}
+
+}  // namespace cudnn_backend
+}  // namespace lczero
diff --git a/src/neural/cuda/kernels.h b/src/neural/backends/cuda/kernels.h
similarity index 85%
rename from src/neural/cuda/kernels.h
rename to src/neural/backends/cuda/kernels.h
index a1a2145737..91ee87abe0 100644
--- a/src/neural/cuda/kernels.h
+++ b/src/neural/backends/cuda/kernels.h
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "cuda_common.h"
-#include "neural/shared/activation.h"
+#include "neural/tables/activation_function.h"
 
 namespace lczero {
 namespace cudnn_backend {
@@ -67,7 +67,8 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W,
 // params, also pad/un-pad elements from Batch or Channel dimensions
 template <typename DstType, typename SrcType>
 void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor,
-                       int Nin, int Cin, int Nout, int Cout, int H, int W);
+                       int Nin, int Cin, int Nout, int Cout, int H, int W,
+                       cudaStream_t stream);
 
 // Plain data-type conversion (no layout conversion).
 template <typename DstType, typename SrcType>
@@ -77,35 +78,34 @@ void copyTypeConverted(DstType* op, SrcType* ip, int N, cudaStream_t stream);
 template <typename T>
 void batchNorm(T* output, const T* input, const T* skipInput, int N, int C,
                int H, int W, float* means, float* var_multipliers,
-               ActivationFunction activation);
+               ActivationFunction activation, cudaStream_t stream);
 
 // Unpack planes (input to network).
-void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks,
-                            const float* values, int n, cudaStream_t stream);
-
-void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks,
-                            const float* values, int n, cudaStream_t stream);
+template <typename T>
+void expandPlanes_NHWC(T* output, const uint64_t* masks, const T* values, int n,
+                       cudaStream_t stream);
 
-void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks,
-                            const float* values, int n, cudaStream_t stream);
+template <typename T>
+void expandPlanes_NCHW(T* output, const uint64_t* masks, const T* values, int n,
+                       cudaStream_t stream);
 
 // Perform global avg pool.
 template <typename T>
 void globalAvgPool(int N, int C, T* output, const T* input,
-                   const T* prevLayerBias, bool nhwc);
+                   const T* prevLayerBias, bool nhwc, cudaStream_t stream);
 
 // Perform global scale.
 template <typename T>
 void globalScale(int N, int C, T* output, const T* input, const T* scaleBias,
                  const T* prevLayerBias, bool nhwc,
-                 ActivationFunction activation);
+                 ActivationFunction activation, cudaStream_t stream);
 
 // Perform Squeeze-and-Excitation (SE) in a single fused kernel.
 // Returns false if the fused kernel can't handle the sizes.
 bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip,
                   const half* input, const half* w1, const half* b1,
                   const half* w2, const half* b2, const half* bPrev,
-                  ActivationFunction activation);
+                  ActivationFunction activation, cudaStream_t stream);
 
 template <typename T>
 void PolicyMap(int N, T* output, const T* input, const short* indices,
@@ -114,7 +114,8 @@ void PolicyMap(int N, T* output, const T* input, const short* indices,
 
 // Custom winograd helper functions
 template <typename T>
-void FilterTransform(int N, int C, T* transformedFilter, const T* filter);
+void FilterTransform(int N, int C, T* transformedFilter, const T* filter,
+                     cudaStream_t stream);
 
 template <typename T, bool nhcw>
 void InputTransform(int N, int C, T* transformedInput, const T* input,
@@ -157,5 +158,14 @@ void inputPreprocessForAttentionBody(T* output, const T* input,
 template <typename T>
 void applyInputGating(T* output, const T* input, const T* mult, const T* add,
                       int N, int HW, int C, cudaStream_t stream);
+
+template <typename T>
+void genOffsetPointers(T** offsets, int heads, int max_batch, int depth,
+                       int d_model, T* k, T* q, T* b1, T* v, T* b2,
+                       cudaStream_t stream);
+
+void fusedMHA(void* output, void* mha_q, void* mha_k, void* mha_v, void* skip,
+              int batch_size, int num_heads, int depth, cudaStream_t stream);
+
 }  // namespace cudnn_backend
 }  // namespace lczero
diff --git a/src/neural/cuda/layers.cc b/src/neural/backends/cuda/layers.cc
similarity index 96%
rename from src/neural/cuda/layers.cc
rename to src/neural/backends/cuda/layers.cc
index 8543443897..5ae5b7f7dc 100644
--- a/src/neural/cuda/layers.cc
+++ b/src/neural/backends/cuda/layers.cc
@@ -33,8 +33,7 @@
 #include "cuda_common.h"
 #include "kernels.h"
 #include "neural/network.h"
-#include "neural/shared/activation.h"
-#include "neural/shared/attention_policy_map.h"
+#include "neural/tables/attention_policy_map.h"
 #include "utils/fp16_utils.h"
 
 namespace lczero {
@@ -220,7 +219,7 @@ void ConvLayer<half>::LoadWeights(float* pfilter, float* pBias, void* scratch) {
 
   if (nhwc_) {
     convertNCHWtoNHWC((half*)weights, (float*)scratch, C, c_input_, C, c_input_,
-                      filter_size_, filter_size_);
+                      filter_size_, filter_size_, 0);
   } else {
     copyTypeConverted((half*)weights, (float*)scratch,
                       C * c_input_ * filter_size_ * filter_size_, 0);
@@ -496,7 +495,7 @@ void SELayer<float>::Eval(int N, float* output, const float* input,
 
   // 1. Global avg pooling (also adds previous layer bias before computing
   // averages).
-  globalAvgPool(N, C, op2, input, bPrev_, false);
+  globalAvgPool(N, C, op2, input, bPrev_, false, stream);
 
   // 2. First fully connected layer.
   float alpha = 1.0f, beta = 0.0f;
@@ -515,7 +514,7 @@ void SELayer<float>::Eval(int N, float* output, const float* input,
 
   // 4. (Optional prev layer bias add), Global scale, residual add, relu and
   // bias.
-  globalScale(N, C, output, input, op2, bPrev_, false, act_);
+  globalScale(N, C, output, input, op2, bPrev_, false, act_, stream);
 }
 
 template <>
@@ -526,7 +525,7 @@ void SELayer<half>::Eval(int N, half* output, const half* input,
   bool se_done = false;
   if (kUseFusedSELayer && nhwc_) {
     se_done = Se_Fp16_NHWC(N, C, numFc1Out_, output, input2, input, w1_t_, b1_,
-                           w2_t_, b2_, bPrev_, act_);
+                           w2_t_, b2_, bPrev_, act_, stream);
   }
   if (!se_done) {
     assert(output == input2);
@@ -536,7 +535,7 @@ void SELayer<half>::Eval(int N, half* output, const half* input,
 
     // 1. Global avg pooling (also adds previous layer bias before computing
     // averages).
-    globalAvgPool(N, C, op2, input, bPrev_, nhwc_);
+    globalAvgPool(N, C, op2, input, bPrev_, nhwc_, stream);
 
     // 2. First fully connected layer.
     __half_raw one_h{0x3C00};
@@ -558,7 +557,7 @@ void SELayer<half>::Eval(int N, half* output, const half* input,
 
     // 4. (Optional prev layer bias add), Global scale, residual add, relu and
     // bias.
-    globalScale(N, C, output, input, op2, bPrev_, nhwc_, act_);
+    globalScale(N, C, output, input, op2, bPrev_, nhwc_, act_, stream);
   }
 }
 
@@ -594,7 +593,7 @@ void FCLayer<half>::LoadWeights(float* cpuWeight, float* cpuBias,
   if (nhwc_) {
     convertNCHWtoNHWC((half*)weights_, (float*)scratch, (int)num_biases,
                       input_->GetC(), (int)num_biases, input_->GetC(),
-                      input_->GetH(), input_->GetW());
+                      input_->GetH(), input_->GetW(), 0);
   } else {
     copyTypeConverted((half*)weights_, (float*)scratch, (int)num_weights, 0);
   }
@@ -852,7 +851,7 @@ void FusedWinogradConvSELayer<DataType>::LoadWeights(float* pfilter,
   }
 
   // run winograd transform kernel for the filter
-  FilterTransform(C, c_input_, transformed_weights_, weights);
+  FilterTransform(C, c_input_, transformed_weights_, weights, 0);
 }
 
 // TODO: Do this on the GPU to improve network load time!
@@ -1201,7 +1200,7 @@ void ResidualBlock<DataType>::LoadWeights0(float* pfilter, float* pBias,
   }
 
   // run winograd transform kernel for the filter
-  FilterTransform(C, c_input_, transformed_weights0_, weights);
+  FilterTransform(C, c_input_, transformed_weights0_, weights, 0);
 }
 
 template <typename DataType>
@@ -1227,7 +1226,7 @@ void ResidualBlock<DataType>::LoadWeights1(float* pfilter, float* pBias,
   }
 
   // run winograd transform kernel for the filter
-  FilterTransform(C, C, transformed_weights1_, weights);
+  FilterTransform(C, C, transformed_weights1_, weights, 0);
 }
 
 template <typename DataType>
@@ -1423,7 +1422,7 @@ template <typename DataType>
 AttentionPolicyHead<DataType>::AttentionPolicyHead(
     BaseLayer<DataType>* ip, const MultiHeadWeights::PolicyHead& weights,
     void* scratch, bool attention_body, ActivationFunction act,
-    int max_batch_size)
+    int max_batch_size, bool use_gemm_ex)
     : BaseLayer<DataType>(64 * 64 + 24 * 8, 1, 1, ip),
       attention_body_(attention_body),
       // Old networks without attention body (e.g. T79) use hardcoded SELU
@@ -1475,8 +1474,9 @@ AttentionPolicyHead<DataType>::AttentionPolicyHead(
         nullptr, 0,  // smolgen weights not implemented in
                      // policy encoder heads yet.
         max_batch_size, ACTIVATION_SWISH, act_,
-        1e-6);  // attentionbody nets don't have policy encoders, so using old
-                // epsilon for backward compatibility with T78.
+        1e-6,  // attentionbody nets don't have policy encoders, so using old
+               // epsilon for backward compatibility with T78.
+        use_gemm_ex, false);
     encoder_weights_.emplace_back(pW);
   }
 }
@@ -1486,7 +1486,8 @@ EncoderBlock<DataType>::EncoderBlock(
     const MultiHeadWeights::EncoderLayer& cpu_weights, void* scratch, int heads,
     int size, float alpha, DataType* smolgen_global_scratch,
     int smolgen_global_size, int max_batch_size, ActivationFunction smolgen_act,
-    ActivationFunction ffn_act, float default_eps)
+    ActivationFunction ffn_act, float default_eps, bool use_gemm_ex,
+    bool fused_mha)
     : embedding_op_size_(size),
       encoder_heads_(heads),
       alpha_(alpha),
@@ -1494,7 +1495,9 @@ EncoderBlock<DataType>::EncoderBlock(
       has_smolgen_(cpu_weights.mha.has_smolgen),
       smolgen_activation_(smolgen_act),
       ffn_activation_(ffn_act),
-      max_batch_size_(max_batch_size) {
+      max_batch_size_(max_batch_size),
+      use_fused_mha_(fused_mha),
+      use_gemm_ex_(use_gemm_ex) {
   mha_q_size_ = cpu_weights.mha.q_b.size();
   mha_k_size_ = cpu_weights.mha.k_b.size();
   mha_v_size_ = cpu_weights.mha.v_b.size();
@@ -1606,7 +1609,8 @@ static void cublasXGemmStridedBatched(
     cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
     int m, int n, int k, float alpha, const void* A, int lda,
     long long int strideA, const void* B, int ldb, long long int strideB,
-    float beta, void* C, int ldc, long long int strideC, int batchCount) {
+    float beta, void* C, int ldc, long long int strideC, int batchCount,
+    bool use_gemm_ex) {
   const bool fp16 = std::is_same<half, DataType>::value;
   if (fp16) {
     unsigned short alpha_h = FP32toFP16(alpha);
@@ -1616,10 +1620,17 @@ static void cublasXGemmStridedBatched(
         B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC,
         batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT));
   } else {
-    ReportCUBLASErrors(cublasGemmStridedBatchedEx(
-        handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B,
-        CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC,
-        batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
+    if (use_gemm_ex) {
+      ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+          handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA,
+          B, CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC,
+          batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
+    } else {
+      ReportCUBLASErrors(cublasSgemmStridedBatched(
+          handle, transa, transb, m, n, k, &alpha, (const float*)A, lda,
+          strideA, (const float*)B, ldb, strideB, &beta, (float*)C, ldc,
+          strideC, batchCount));
+    }
   }
 }
 
@@ -1737,7 +1748,8 @@ void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
     cublasXGemmStridedBatched<DataType>(
         cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f,
         mha_qkv_w, num_inputs, num_inputs * num_outputs, in_out_tensor,
-        num_inputs, 0, 0.0f, mha_q, num_outputs, num_outputs * max_batch, 3);
+        num_inputs, 0, 0.0f, mha_q, num_outputs, num_outputs * max_batch, 3,
+        use_gemm_ex_);
     addBiasBatched<DataType>(mha_q, mha_q, mha_qkv_b, 3, batch, num_outputs,
                              max_batch, ACTIVATION_NONE, stream);
   }
@@ -1761,31 +1773,33 @@ void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
   // shape(k)[-1] = depth
   float factor = 1.0f / sqrt((float)depth);
 
+#ifdef USE_CUTLASS
+  if (use_fused_mha_) {
+    // TODO: check whether skip needs to live in a different tensor than the
+    // output (both are buffer2 here)!
+    fusedMHA(buffer2, mha_q, mha_k, mha_v, has_smolgen_ ? buffer2 : nullptr, N,
+             encoder_heads_, depth, stream);
+  } else
+#endif
   // matmul_qk = tf.matmul(q, k, transpose_b=True)
   {
     if (*offset_pointers == nullptr) {
-      std::vector<DataType*> offsets(encoder_heads_ * max_batch_size_ * 5);
-      for (int i = 0; i < encoder_heads_ * max_batch_size_; i++) {
-        int h = i % encoder_heads_;
-        int n = i / encoder_heads_;
-        offsets[i] = mha_k + h * depth + 64 * d_model * n;
-        offsets[i + encoder_heads_ * max_batch_size_] =
-            mha_q + h * depth + 64 * d_model * n;
-        offsets[i + 2 * encoder_heads_ * max_batch_size_] =
-            buffer1 + i * 64 * 64;
-        offsets[i + 3 * encoder_heads_ * max_batch_size_] =
-            mha_v + h * depth + 64 * d_model * n;
-        offsets[i + 4 * encoder_heads_ * max_batch_size_] =
-            buffer2 + h * depth + 64 * d_model * n;
-      }
+#ifndef NDEBUG
+      cudaStreamCaptureStatus capture;
+      ReportCUDAErrors(cudaStreamIsCapturing(stream, &capture));
+      assert(capture !=
+                 cudaStreamCaptureStatus::cudaStreamCaptureStatusActive &&
+             "Stream capture is active, cannot allocate memory for offset "
+             "pointers");
+#endif
       ReportCUDAErrors(
           cudaMalloc((void**)offset_pointers,
                      encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*)));
-      ReportCUDAErrors(
-          cudaMemcpy(*offset_pointers, offsets.data(),
-                     encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*),
-                     cudaMemcpyHostToDevice));
+      genOffsetPointers((DataType**)*offset_pointers, encoder_heads_,
+                        max_batch_size_, depth, d_model, mha_k, mha_q, buffer1,
+                        mha_v, buffer2, stream);
     }
+
     cublasXGemmBatched<DataType>(
         cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/,
         depth /*K*/,  // A/B, and M/N are swapped for row-major to col-major
@@ -1806,20 +1820,18 @@ void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
         64 /*LDC*/,
         // 64 * 64 /*strideC*/,
         N * encoder_heads_);
-  }
 
-  // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)
-  // attention_weights -> buffer1
-  if (has_smolgen_) {
-    // Add smolgen weights to the scaled matmul_qk attention logits before
-    // softmax.
-    Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, buffer2, stream);
-  } else {
-    Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1,
-            (const DataType*)nullptr, stream);
-  }
+    // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)
+    // attention_weights -> buffer1
+    if (has_smolgen_) {
+      // Add smolgen weights to the scaled matmul_qk attention logits before
+      // softmax.
+      Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, buffer2, stream);
+    } else {
+      Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1,
+              (const DataType*)nullptr, stream);
+    }
 
-  {
     cublasXGemmBatched<DataType>(
         cublas, CUBLAS_OP_N, CUBLAS_OP_N, depth /*M*/, 64 /*N*/, 64 /*K*/, 1.0f,
         *offset_pointers + encoder_heads_ * max_batch_size_ *
@@ -1893,8 +1905,10 @@ void AttentionPolicyHead<DataType>::Eval(
   DataType* buffer2 = input2_tensor + scratch_size / (2 * sizeof(DataType));
 
   int inputC = this->input_->GetC();
-  if (!attention_body_)
-    convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8);
+  bool input_nhwc = attention_body_ || this->input_->isNHWC();
+  if (!input_nhwc)
+    convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8,
+                      stream);
 
   // 1. Policy embedding (fully connected layer)
   // Input data in NHWC layout N*(64)*C, output is N*(64)*embedding_op_size_
@@ -1906,7 +1920,7 @@ void AttentionPolicyHead<DataType>::Eval(
     cublasXgemm<DataType>(cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch,
                           num_inputs, 1.0f, (const DataType*)ip_pol_w_,
                           num_inputs,
-                          attention_body_ ? input : (DataType*)scratch,
+                          input_nhwc ? input : (DataType*)scratch,
                           num_inputs, 0.0f, pol_embedding, num_outputs);
     addBiasBatched(pol_embedding, pol_embedding, ip_pol_b_, 1, batch,
                    num_outputs, act_, stream);
@@ -1930,7 +1944,7 @@ void AttentionPolicyHead<DataType>::Eval(
     cublasXGemmStridedBatched<DataType>(
         cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f,
         wqk_w_, num_inputs, num_inputs * num_outputs, input2_tensor, num_inputs,
-        0, 0.0f, wq, num_outputs, num_outputs * batch, 2);
+        0, 0.0f, wq, num_outputs, num_outputs * batch, 2, use_gemm_ex_);
 
     addBiasBatched<DataType>(wq, wq, wqk_b_, 2, batch, num_outputs,
                              ACTIVATION_NONE, stream);
@@ -1953,7 +1967,7 @@ void AttentionPolicyHead<DataType>::Eval(
         wk /*A*/, policy_d_model_ /*LDA*/, 64 * policy_d_model_, /*strideA*/
         wq /*B*/, policy_d_model_ /*LDB*/, 64 * policy_d_model_, /*strideB*/
         0.0f, output /*C*/,  // output (policy_attn_logits)
-        64 /*LDC*/, 64 * 64 + 8 * 24 /*strideC*/, N);
+        64 /*LDC*/, 64 * 64 + 8 * 24 /*strideC*/, N, use_gemm_ex_);
   }
 
   // Compute promotion_logits in a single kernel (and put the result just after
@@ -2046,8 +2060,10 @@ AttentionBody<DataType>::AttentionBody(const MultiHeadWeights& weights,
                                        void* scratch, Activations activations,
                                        int num_res_blocks, int input_c,
                                        int max_batch_size,
-                                       bool is_pe_dense_embedding)
-    : BaseLayer<DataType>(weights.ip_emb_b.size(), 8, 8, nullptr),
+                                       bool is_pe_dense_embedding,
+                                       bool use_gemm_ex, bool fused_mha)
+    : BaseLayer<DataType>(weights.ip_emb_b.size(), 8, 8, nullptr, false,
+                          use_gemm_ex),
       embedding_op_size_(weights.ip_emb_b.size()),
       encoder_head_count_(weights.encoder_head_count),
       activations_(activations),
@@ -2056,7 +2072,8 @@ AttentionBody<DataType>::AttentionBody(const MultiHeadWeights& weights,
       has_gating_(weights.ip_mult_gate.size() > 0 &&
                   weights.ip_add_gate.size() > 0),
       has_smolgen_(weights.has_smolgen),
-      is_pe_dense_embedding_(is_pe_dense_embedding) {
+      is_pe_dense_embedding_(is_pe_dense_embedding),
+      use_fused_mha_(fused_mha) {
   allocAndUpload<DataType>(&ip_emb_w_, weights.ip_emb_w, scratch);
   allocAndUpload<DataType>(&ip_emb_b_, weights.ip_emb_b, scratch);
 
@@ -2111,7 +2128,7 @@ AttentionBody<DataType>::AttentionBody(const MultiHeadWeights& weights,
         enc, scratch, encoder_head_count_, embedding_op_size_, alpha,
         smolgen_global_, smolgen_global_size_, max_batch_size,
         activations_.smolgen_activation, activations_.ffn_activation,
-        is_pe_dense_embedding_ ? 1e-3 : 1e-6);
+        is_pe_dense_embedding_ ? 1e-3 : 1e-6, use_gemm_ex, use_fused_mha_);
     encoder_weights_.emplace_back(pW);
   }
 }
@@ -2173,7 +2190,8 @@ void AttentionBody<DataType>::Eval(int N, DataType* output,
       const int num_inputs = 64 * 12;
       const int batch = N;
 
-      convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, 12, 8, 8);
+      convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, 12, 8, 8,
+                        stream);
       cublasXgemm<DataType>(
           cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs,
           1.0f, (const DataType*)ip_emb_pre_w_, num_inputs,
@@ -2208,7 +2226,8 @@ void AttentionBody<DataType>::Eval(int N, DataType* output,
     // #redirect flow through encoder blocks
     // flow = tf.transpose(flow, perm = [ 0, 2, 3, 1 ])
     // flow = tf.reshape(flow, [ -1, 64, self.RESIDUAL_FILTERS ])
-    convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8);
+    convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8,
+                      stream);
   }
 
   if (is_pe_dense_embedding_) {
@@ -2440,6 +2459,7 @@ void CudnnError(cudnnStatus_t status, const char* file, const int& line) {
     char message[128];
     sprintf(message, "CUDNN error: %s (%s:%d) ", cudnnGetErrorString(status),
             file, line);
+    CERR << message;
     throw Exception(message);
   }
 }
@@ -2476,6 +2496,7 @@ void CublasError(cublasStatus_t status, const char* file, const int& line) {
     char message[128];
     sprintf(message, "CUBLAS error: %s (%s:%d) ", CublasGetErrorString(status),
             file, line);
+    CERR << message;
     throw Exception(message);
   }
 }
@@ -2485,6 +2506,7 @@ void CudaError(cudaError_t status, const char* file, const int& line) {
     char message[128];
     sprintf(message, "CUDA error: %s (%s:%d) ", cudaGetErrorString(status),
             file, line);
+    CERR << message;
     throw Exception(message);
   }
 }
diff --git a/src/neural/cuda/layers.h b/src/neural/backends/cuda/layers.h
similarity index 96%
rename from src/neural/cuda/layers.h
rename to src/neural/backends/cuda/layers.h
index de563f9346..5c5ec871c1 100644
--- a/src/neural/cuda/layers.h
+++ b/src/neural/backends/cuda/layers.h
@@ -29,10 +29,11 @@
 #include <cublas_v2.h>
 
 #include <cstddef>
+#include <memory>
 
 #include "cuda_common.h"
 #include "neural/network_legacy.h"
-#include "neural/shared/activation.h"
+#include "neural/tables/activation_function.h"
 
 #ifdef USE_CUDNN
 #include <cudnn.h>
@@ -340,7 +341,8 @@ class EncoderBlock {
                int heads, int size, float alpha,
                DataType* smolgen_global_scratch, int smolgen_global_size,
                int max_batch_size, ActivationFunction smolgen_act,
-               ActivationFunction ffn_act, float default_eps);
+               ActivationFunction ffn_act, float default_eps, bool use_gemm_ex,
+               bool fused_mha);
   ~EncoderBlock();
 
   void Eval(int N, DataType* inpop, DataType* scratch0, DataType* scratch1,
@@ -393,6 +395,8 @@ class EncoderBlock {
   int smol_global_size_;
 
   const int max_batch_size_;
+  const bool use_fused_mha_;
+  const bool use_gemm_ex_;
 };
 
 // The Attention policy head implementation
@@ -406,12 +410,14 @@ class AttentionPolicyHead : public BaseLayer<DataType> {
   using BaseLayer<DataType>::GetC;
   using BaseLayer<DataType>::GetH;
   using BaseLayer<DataType>::GetW;
+  using BaseLayer<DataType>::use_gemm_ex_;
 
  public:
   AttentionPolicyHead(BaseLayer<DataType>* ip,
                       const MultiHeadWeights::PolicyHead& weights,
                       void* scratch, bool attention_body,
-                      ActivationFunction act, int max_batch_size);
+                      ActivationFunction act, int max_batch_size,
+                      bool use_gemm_ex);
   ~AttentionPolicyHead();
   void Eval(int N, DataType* output, const DataType* input,
             const DataType* input2, void* scratch, size_t scratch_size,
@@ -476,7 +482,8 @@ class AttentionBody : public BaseLayer<DataType> {
  public:
   AttentionBody(const MultiHeadWeights& weights, void* scratch,
                 Activations activations, int num_res_blocks, int input_c,
-                int max_batch_size, bool is_pe_dense_embedding);
+                int max_batch_size, bool is_pe_dense_embedding,
+                bool use_gemm_ex, bool fused_mha);
   ~AttentionBody();
   void Eval(int N, DataType* output, const DataType* input,
             const DataType* input2, void* scratch, size_t scratch_size,
@@ -507,6 +514,7 @@ class AttentionBody : public BaseLayer<DataType> {
   const bool has_gating_;
   const bool has_smolgen_;
   bool is_pe_dense_embedding_;  // flag for dense position encoding
+  const bool use_fused_mha_;
 };
 
 // The value head implementation
@@ -523,8 +531,8 @@ class ValueHead : public BaseLayer<DataType> {
 
  public:
   ValueHead(BaseLayer<DataType>* ip, const MultiHeadWeights::ValueHead& weights,
-            void* scratch, bool attention_body, bool wdl, ActivationFunction act,
-            int max_batch_size, bool use_gemm_ex);
+            void* scratch, bool attention_body, bool wdl,
+            ActivationFunction act, int max_batch_size, bool use_gemm_ex);
   ~ValueHead();
   void Eval(int N, DataType* output, const DataType* input,
             const DataType* input2, void* scratch, size_t scratch_size,
@@ -548,6 +556,5 @@ class ValueHead : public BaseLayer<DataType> {
   ActivationFunction act_;
 };
 
-
 }  // namespace cudnn_backend
 }  // namespace lczero
diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/backends/cuda/network_cuda.cc
similarity index 71%
rename from src/neural/cuda/network_cuda.cc
rename to src/neural/backends/cuda/network_cuda.cc
index cf67d1336c..85c80ce2e8 100644
--- a/src/neural/cuda/network_cuda.cc
+++ b/src/neural/backends/cuda/network_cuda.cc
@@ -26,10 +26,10 @@
 */
 #include <algorithm>
 #include <cassert>
-#include <functional>
 #include <list>
 #include <memory>
 #include <mutex>
+#include <type_traits>
 
 #include "cuda_common.h"
 #include "inputs_outputs.h"
@@ -37,11 +37,19 @@
 #include "layers.h"
 #include "neural/factory.h"
 #include "neural/network_legacy.h"
-#include "neural/shared/activation.h"
-#include "neural/shared/attention_policy_map.h"
-#include "neural/shared/policy_map.h"
-#include "utils/bititer.h"
+#include "neural/tables/attention_policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "utils/exception.h"
+#include "utils/fp16_utils.h"
+#include "utils/trace.h"
+
+#if CUDART_VERSION >= 11010
+#define CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS 1
+#else
+#define CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS 0
+#undef cudaEventWaitExternal
+#undef cudaEventRecordExternal
+#endif
 
 namespace lczero {
 using namespace cudnn_backend;
@@ -121,8 +129,8 @@ static size_t getMaxAttentionBodySize(const MultiHeadWeights& weights, int N) {
 template <typename DataType>
 class CudaNetworkComputation : public NetworkComputation {
  public:
-  CudaNetworkComputation(CudaNetwork<DataType>* network,
-                         bool wdl, bool moves_left);
+  CudaNetworkComputation(CudaNetwork<DataType>* network, bool wdl,
+                         bool moves_left);
   ~CudaNetworkComputation();
 
   void AddInput(InputPlanes&& input) override {
@@ -131,11 +139,11 @@ class CudaNetworkComputation : public NetworkComputation {
     const auto iter_val =
         &inputs_outputs_->input_val_mem_[batch_size_ * kInputPlanes];
 
-    int i = 0;
-    for (const auto& plane : input) {
+    assert(input.size() == kInputPlanes);
+    for (int i = 0; i < kInputPlanes; i++) {
+      const auto& plane = input[i];
       iter_mask[i] = plane.mask;
-      iter_val[i] = plane.value;
-      i++;
+      ToType(iter_val[i], plane.value);
     }
 
     batch_size_++;
@@ -143,38 +151,47 @@ class CudaNetworkComputation : public NetworkComputation {
 
   void ComputeBlocking() override;
 
+  void CaptureGraph(std::unique_lock<std::mutex>&& lock = {});
+
   int GetBatchSize() const override { return batch_size_; }
 
   float GetQVal(int sample) const override {
     if (wdl_) {
-      auto w = inputs_outputs_->op_value_mem_[3 * sample + 0];
-      auto l = inputs_outputs_->op_value_mem_[3 * sample + 2];
-      return w - l;
+      const float* wdl =  // Source buffer depends on the output element size.
+          sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float)
+              ? (float*)inputs_outputs_->op_value_mem_  // float-sized outputs: read in place.
+              : inputs_outputs_->wdl_cpu_softmax_.get();  // otherwise read the separate float buffer.
+      return wdl[2 * sample];  // NOTE(review): two floats per sample, first one returned as Q - confirm producer layout.
     }
-    return inputs_outputs_->op_value_mem_[sample];
+    return FromType(inputs_outputs_->op_value_mem_[sample]);  // Non-WDL: one value per sample.
   }
 
   float GetDVal(int sample) const override {
     if (wdl_) {
-      return inputs_outputs_->op_value_mem_[3 * sample + 1];
+      const float* wdl =  // Same buffer selection as in GetQVal.
+          sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float)
+              ? (float*)inputs_outputs_->op_value_mem_
+              : inputs_outputs_->wdl_cpu_softmax_.get();
+      return wdl[2 * sample + 1];  // Second float of the per-sample pair is returned as D.
     }
     return 0.0f;
   }
 
   float GetPVal(int sample, int move_id) const override {
-    return inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id];
+    return FromType(
+        inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]);
   }
 
   float GetMVal(int sample) const override {
     if (moves_left_) {
-      return inputs_outputs_->op_moves_left_mem_[sample];
+      return FromType(inputs_outputs_->op_moves_left_mem_[sample]);
     }
     return 0.0f;
   }
 
  private:
   // Memory holding inputs, outputs.
-  std::unique_ptr<InputsOutputs> inputs_outputs_;
+  std::unique_ptr<InputsOutputs<DataType>> inputs_outputs_;
   int batch_size_;
   bool wdl_;
   bool moves_left_;
@@ -191,6 +208,7 @@ class CudaNetwork : public Network {
                       file.format().network_format().moves_left()} {
     MultiHeadWeights weights(file.weights());
     gpu_id_ = options.GetOrDefault<int>("gpu", 0);
+    enable_graph_capture_ = options.GetOrDefault<bool>("graph_capture", true);
 
     const auto nf = file.format().network_format();
     using NF = pblczero::NetworkFormat;
@@ -211,6 +229,10 @@ class CudaNetwork : public Network {
 
     showInfo();
 
+#ifdef USE_CUTLASS
+    CERR << "Compiled with CUTLASS enabled";
+#endif
+
     int total_gpus;
     ReportCUDAErrors(cudaGetDeviceCount(&total_gpus));
 
@@ -219,7 +241,7 @@ class CudaNetwork : public Network {
 
     cudaDeviceProp deviceProp = {};
     cudaGetDeviceProperties(&deviceProp, gpu_id_);
-    showDeviceInfo(deviceProp);
+    showDeviceInfo(deviceProp, gpu_id_);
 
     l2_cache_size_ = deviceProp.l2CacheSize;
     sm_count_ = deviceProp.multiProcessorCount;
@@ -256,7 +278,16 @@ class CudaNetwork : public Network {
     }
 
     if (!multi_stream_) {
+      ReportCUDAErrors(
+          cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking));
+      ReportCUDAErrors(
+          cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking));
+      ReportCUDAErrors(
+          cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking));
+      ReportCUDAErrors(cudaEventCreateWithFlags(&compute_ordering_event_,
+                                                cudaEventDisableTiming));
       ReportCUBLASErrors(cublasCreate(&cublas_));
+      ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_));
       if (has_tensor_cores_)
         ReportCUBLASErrors(cublasSetMathMode(
             cublas_,
@@ -307,10 +338,15 @@ class CudaNetwork : public Network {
       use_res_block_winograd_fuse_opt_ = false;
     }
     // Override if set in backend-opts.
-    if (!options.IsDefault<bool>("res_block_fusing")) {
+    if (options.Exists<bool>("res_block_fusing")) {
       use_res_block_winograd_fuse_opt_ = options.Get<bool>("res_block_fusing");
     }
 
+    bool use_fused_mha = false;
+    if (deviceProp.major >= 8 && fp16) {
+      use_fused_mha = options.GetOrDefault<bool>("fused_mha", true);
+    }
+
     const bool use_gemm_ex = deviceProp.major >= 5;
 
     // 0. Check for SE.
@@ -343,14 +379,14 @@ class CudaNetwork : public Network {
     std::string policy_head =
         options.GetOrDefault<std::string>("policy_head", "vanilla");
     // Check that selected policy head exists.
-    if (weights.policy_heads.count(policy_head) == 0) {
+    if (!weights.policy_heads.contains(policy_head)) {
       throw Exception("The policy head you specified '" + policy_head +
                       "' does not exist in this net.");
     }
     std::string value_head =
         options.GetOrDefault<std::string>("value_head", "winner");
     // Check that selected value head exists.
-    if (weights.value_heads.count(value_head) == 0) {
+    if (!weights.value_heads.contains(value_head)) {
       throw Exception("The value head you specified '" + value_head +
                       "' does not exist in this net.");
     }
@@ -458,7 +494,8 @@ class CudaNetwork : public Network {
           numBlocks_ > 0 ? kNumFilters : kInputPlanes, max_batch_size_,
           static_cast<InputEmbedding>(
               file.format().network_format().input_embedding()) ==
-              InputEmbedding::INPUT_EMBEDDING_PE_DENSE);
+              InputEmbedding::INPUT_EMBEDDING_PE_DENSE,
+          use_gemm_ex, use_fused_mha);
       network_.emplace_back(std::move(attention_body));
 
       encoder_last_ = getLastLayer();
@@ -470,7 +507,7 @@ class CudaNetwork : public Network {
       if (attn_policy_) {
         auto AttentionPolicy = std::make_unique<AttentionPolicyHead<DataType>>(
             getLastLayer(), head, scratch_mem_, attn_body_, act,
-            max_batch_size_);
+            max_batch_size_, use_gemm_ex);
         network_.emplace_back(std::move(AttentionPolicy));
 
         auto policymap = std::make_unique<PolicyMapLayer<DataType>>(
@@ -530,8 +567,8 @@ class CudaNetwork : public Network {
              pblczero::NetworkFormat::VALUE_WDL;
       BaseLayer<DataType>* lastlayer = attn_body_ ? encoder_last_ : resi_last_;
       auto value_main = std::make_unique<ValueHead<DataType>>(
-          lastlayer, head, scratch_mem_, attn_body_, wdl_, act,
-          max_batch_size_, use_gemm_ex);
+          lastlayer, head, scratch_mem_, attn_body_, wdl_, act, max_batch_size_,
+          use_gemm_ex);
       network_.emplace_back(std::move(value_main));
     }
 
@@ -592,18 +629,86 @@ class CudaNetwork : public Network {
 
     tensor_mem_size_ = multi_stream_ ? maxSize : 0;
 
-    // pre-allocate one InputsOutputs object
-    // The first call to allocate memory, create cublas,
-    // strem, etc takes really long (600 ms)
-    std::unique_ptr<InputsOutputs> io = GetInputsOutputs();
+    // pre-allocate cuda graphs for search threads
+    auto allocateCudaGraphs = [&] {
+      ReportCUDAErrors(cudaSetDevice(gpu_id_));
+      CudaNetworkComputation<DataType> comp(this, wdl_, moves_left_);
+      comp.AddInput(InputPlanes{(size_t)kNumInputPlanes});
+      // Make sure cublas is initialized in this thread.
+      comp.ComputeBlocking();
+      for (int i = 0; i < GetMiniBatchSize(); i++) {
+        comp.AddInput(InputPlanes{(size_t)kNumInputPlanes});
+        auto lock = LockEval();
+        comp.CaptureGraph(std::move(lock));
+      }
+    };
+    std::thread t2(allocateCudaGraphs);
+    allocateCudaGraphs();
+    t2.join();
+  }
+
+  std::unique_lock<std::mutex> LockEval() {  // Returns the guard a caller holds around evaluation.
+    if (multi_stream_) {
+      return {};  // Multi-stream mode: empty guard, lock_ is not acquired.
+    } else {
+      return std::unique_lock<std::mutex>{lock_};  // Acquires lock_; released when the returned guard is.
+    }
+  }
+
+  bool GetGraphCaptureEnabled() const { return enable_graph_capture_; }
+
+  CudaGraphCapture<DataType> BeginCapture(InputsOutputs<DataType>& io) {  // NOTE(review): roles of the two stream arguments are defined by CudaGraphCapture - confirm there.
+    if (!multi_stream_) {
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+      return {io, upload_stream_, download_stream_};  // Single-stream mode: streams owned by the network.
+#else
+      return {io, compute_stream_, download_stream_};  // No external events: compute_stream_ is passed in place of upload_stream_.
+#endif
+    } else {
+      return {io, io.upload_stream_, io.download_stream_};  // Multi-stream mode: streams owned by io.
+    }
+  }
 
-  void forwardEval(InputsOutputs* io, int batchSize) {
+  void UploadInputs(InputsOutputs<DataType>* io, int batchSize) {
+    // Multi-stream can capture uploads without external events.
+    if (multi_stream_) return;
+    ReportCUDAErrors(  // Async host-to-device copy of the input masks on upload_stream_.
+        cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_,
+                        batchSize * kInputPlanes * sizeof(uint64_t),
+                        cudaMemcpyHostToDevice, upload_stream_));
+    ReportCUDAErrors(cudaMemcpyAsync(  // Async host-to-device copy of the input values.
+        io->input_val_mem_gpu_, io->input_val_mem_,
+        batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]),
+        cudaMemcpyHostToDevice, upload_stream_));
+    ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream_));  // Marks the end of both copies.
+    ReportCUDAErrors(  // Work queued later on compute_stream_ waits for the upload to finish.
+        cudaStreamWaitEvent(compute_stream_, io->upload_done_event_, 0));
+  }
+
+  void GraphLaunch(InputsOutputs<DataType>* io, int batchSize) {  // NOTE(review): no bounds check; assumes a graph exists at index batchSize - 1 - confirm callers.
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+    io->cuda_graphs_[batchSize - 1].Launch(io->exec_stream_);  // Graphs are indexed by batch size minus one.
+#else  // CUDART_VERSION < 11010: no external event nodes.
+    if (!multi_stream_) {
+      UploadInputs(io, batchSize);  // Upload is issued here, before the graph launch.
+
+      io->cuda_graphs_[batchSize - 1].Launch(compute_stream_);
+      ReportCUDAErrors(  // Recorded on compute_stream_ after the graph launch.
+          cudaEventRecord(io->download_done_event_, compute_stream_));
+    } else {
+      io->cuda_graphs_[batchSize - 1].Launch(io->exec_stream_);
+      ReportCUDAErrors(  // Recorded on io->exec_stream_ after the graph launch.
+          cudaEventRecord(io->download_done_event_, io->exec_stream_));
+    }
+#endif
+  }
+
+  void forwardEval(InputsOutputs<DataType>* io, int batchSize,
+                   [[maybe_unused]] bool capture = false) {
     // It is safe to evaluate larger than the batchSize
     // as all buffers are designed to handle max_batch_size
     // and the extra invalid results are never read.
     if (batchSize < min_batch_size_) batchSize = min_batch_size_;
-    if (!multi_stream_) lock_.lock();
 
 #ifdef DEBUG_RAW_NPS
     auto t_start = std::chrono::high_resolution_clock::now();
@@ -611,13 +716,13 @@ class CudaNetwork : public Network {
 
     // Expand packed planes to full planes.
     uint64_t* ipDataMasks = io->input_masks_mem_gpu_;
-    float* ipDataValues = io->input_val_mem_gpu_;
+    auto* ipDataValues = io->input_val_mem_gpu_;
 
     DataType* tensor_mem[3];
     void* scratch_mem;
     DataType*** offset_pointers;
     DataType*** head_offset_pointers;
-    cudaStream_t stream;
+    cudaStream_t compute_stream, upload_stream, download_stream;
     cublasHandle_t cublas;
     if (multi_stream_) {
       // We use tensor and scratch memory from InputOutputs (so that multiple
@@ -626,29 +731,49 @@ class CudaNetwork : public Network {
       scratch_mem = io->scratch_mem_;
       offset_pointers = (DataType***)&io->offset_pointers_;
       head_offset_pointers = (DataType***)&io->head_offset_pointers_;
-      stream = io->stream_;
+      compute_stream = io->compute_stream_;
+      upload_stream = io->upload_stream_;
+      download_stream = io->download_stream_;
       cublas = io->cublas_;
     } else {
       for (int i = 0; i < 3; i++) tensor_mem[i] = tensor_mem_[i];
       scratch_mem = scratch_mem_;
       offset_pointers = (DataType***)&offset_pointers_;
       head_offset_pointers = (DataType***)&head_offset_pointers_;
-      stream = 0;  // default stream
+      compute_stream = compute_stream_;
+      upload_stream = upload_stream_;
+      download_stream = download_stream_;
       cublas = cublas_;
     }
 
-    bool fp16 = std::is_same<half, DataType>::value;
-    if (fp16) {
-      expandPlanes_Fp16_NCHW((half*)(tensor_mem[0]), ipDataMasks, ipDataValues,
-                             batchSize * kInputPlanes, stream);
-    } else {
-      expandPlanes_Fp32_NCHW((float*)(tensor_mem[0]), ipDataMasks, ipDataValues,
-                             batchSize * kInputPlanes, stream);
+    if (multi_stream_ || CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS) {
+      ReportCUDAErrors(
+          cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_,
+                          batchSize * kInputPlanes * sizeof(uint64_t),
+                          cudaMemcpyHostToDevice, upload_stream));
+      ReportCUDAErrors(cudaMemcpyAsync(
+          io->input_val_mem_gpu_, io->input_val_mem_,
+          batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]),
+          cudaMemcpyHostToDevice, upload_stream));
+      ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream));
+      ReportCUDAErrors(
+          cudaStreamWaitEvent(compute_stream, io->upload_done_event_, 0));
     }
 
-    float* opPol = io->op_policy_mem_gpu_;
-    float* opVal = io->op_value_mem_gpu_;
-    float* opMov = io->op_moves_left_mem_gpu_;
+    if (!multi_stream_) {
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+      ReportCUDAErrors(
+          cudaStreamWaitEvent(compute_stream, compute_ordering_event_,
+                              capture ? cudaEventWaitExternal : 0));
+#endif
+    }
+
+    expandPlanes_NCHW(tensor_mem[0], ipDataMasks, ipDataValues,
+                      batchSize * kInputPlanes, compute_stream);
+
+    auto* opPol = io->op_policy_mem_gpu_;
+    auto* opVal = io->op_value_mem_gpu_;
+    auto* opMov = io->op_moves_left_mem_gpu_;
 
     // Figure out if the memory requirment for running the res block would fit
     // in the L2 cache.
@@ -676,7 +801,8 @@ class CudaNetwork : public Network {
       // we can use a single alloc to hold all the required tensors, and enable
       // persistent L2 caching on it
       ReportCUDAErrors(cudaStreamSetAttribute(
-          stream, cudaStreamAttributeAccessPolicyWindow, &stream_attribute));
+          compute_stream, cudaStreamAttributeAccessPolicyWindow,
+          &stream_attribute));
 
       enableCacheOpt = true;
       skip_connection =
@@ -694,7 +820,7 @@ class CudaNetwork : public Network {
       // Input.
       network_[l++]->Eval(batchSize, skip_connection, tensor_mem[0], nullptr,
                           scratch_mem, scratch_size_, nullptr, cublas,
-                          stream);  // input conv
+                          compute_stream);  // input conv
 
       // Residual block.
       for (int block = 0; block < numBlocks_; block++) {
@@ -702,15 +828,15 @@ class CudaNetwork : public Network {
           network_[l++]->Eval(batchSize, tensor_mem[2], skip_connection,
                               nullptr, enableCacheOpt ? nullptr : scratch_mem,
                               scratch_size_, nullptr, cublas,
-                              stream);  // block
+                              compute_stream);  // block
         } else {
           network_[l++]->Eval(batchSize, tensor_mem[0], tensor_mem[2], nullptr,
                               scratch_mem, scratch_size_, nullptr, cublas,
-                              stream);  // conv1
+                              compute_stream);  // conv1
 
           network_[l++]->Eval(batchSize, tensor_mem[2], tensor_mem[0],
                               tensor_mem[2], scratch_mem, scratch_size_,
-                              nullptr, cublas, stream);  // conv2
+                              nullptr, cublas, compute_stream);  // conv2
         }
       }
 
@@ -724,7 +850,7 @@ class CudaNetwork : public Network {
           batchSize, tensor_mem[1],
           (numBlocks_ > 0) ? tensor_mem[2] : tensor_mem[0],
           (numBlocks_ > 0) ? tensor_mem[0] : tensor_mem[2], scratch_mem,
-          scratch_size_, nullptr, cublas, stream,
+          scratch_size_, nullptr, cublas, compute_stream,
           offset_pointers);  // Entire attention body of the network
 
       flow = tensor_mem[1];
@@ -736,7 +862,8 @@ class CudaNetwork : public Network {
     if (enableCacheOpt) {
       // reset the cache settings
       stream_attribute.accessPolicyWindow.num_bytes = 0;
-      cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow,
+      cudaStreamSetAttribute(compute_stream,
+                             cudaStreamAttributeAccessPolicyWindow,
                              &stream_attribute);
       cudaCtxResetPersistingL2Cache();
     }
@@ -746,116 +873,131 @@ class CudaNetwork : public Network {
     if (attn_policy_) {
       network_[l++]->Eval(
           batchSize, spare1, flow, spare2, scratch_mem, scratch_size_, nullptr,
-          cublas, stream,
+          cublas, compute_stream,
           head_offset_pointers);  // Entire Attention policy head except for the
                                   // policy map
-      if (fp16) {
-        network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
-                            scratch_size_, nullptr, cublas,
-                            stream);  // policy map layer
-        copyTypeConverted(opPol, (half*)spare2, batchSize * kNumOutputPolicy,
-                          stream);  // POLICY output
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr,
-                            scratch_mem, scratch_size_, nullptr, cublas,
-                            stream);  // policy map layer  // POLICY output
-      }
+      network_[l++]->Eval(
+          batchSize, (DataType*)opPol, spare1, nullptr, scratch_mem,
+          scratch_size_, nullptr, cublas,
+          compute_stream);  // policy map layer  // POLICY output
 
     } else if (conv_policy_) {
       network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem,
                           scratch_size_, nullptr, cublas,
-                          stream);  // policy conv1
+                          compute_stream);  // policy conv1
 
       network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
                           scratch_size_, nullptr, cublas,
-                          stream);  // policy conv2
-
-      if (fp16) {
-        network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem,
-                            scratch_size_, nullptr, cublas,
-                            stream);  // policy map layer
-        copyTypeConverted(opPol, (half*)(spare1), batchSize * kNumOutputPolicy,
-                          stream);  // POLICY output
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opPol, spare2, nullptr,
-                            scratch_mem, scratch_size_, nullptr, cublas,
-                            stream);  // policy map layer  // POLICY output
-      }
+                          compute_stream);  // policy conv2
+
+      network_[l++]->Eval(
+          batchSize, (DataType*)opPol, spare2, nullptr, scratch_mem,
+          scratch_size_, nullptr, cublas,
+          compute_stream);  // policy map layer  // POLICY output
     } else {
       network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem,
                           scratch_size_, nullptr, cublas,
-                          stream);  // pol conv
-
-      if (fp16) {
-        network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
-                            scratch_size_, nullptr, cublas,
-                            stream);  // pol FC
+                          compute_stream);  // pol conv
 
-        copyTypeConverted(opPol, (half*)(spare2), batchSize * kNumOutputPolicy,
-                          stream);  // POLICY
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr,
-                            scratch_mem, scratch_size_, nullptr, cublas,
-                            stream);  // pol FC  // POLICY
-      }
+      network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr,
+                          scratch_mem, scratch_size_, nullptr, cublas,
+                          compute_stream);  // pol FC  // POLICY
     }
+    ReportCUDAErrors(cudaEventRecord(io->policy_done_event_, compute_stream));
+    ReportCUDAErrors(
+        cudaStreamWaitEvent(download_stream, io->policy_done_event_, 0));
 
     // Copy policy output from device memory to host memory.
-    ReportCUDAErrors(
-        cudaMemcpyAsync(io->op_policy_mem_, io->op_policy_mem_gpu_,
-                        sizeof(float) * kNumOutputPolicy * batchSize,
-                        cudaMemcpyDeviceToHost, stream));
+    ReportCUDAErrors(cudaMemcpyAsync(
+        io->op_policy_mem_, io->op_policy_mem_gpu_,
+        sizeof(io->op_policy_mem_[0]) * kNumOutputPolicy * batchSize,
+        cudaMemcpyDeviceToHost, download_stream));
 
     // value head
-    if (fp16) {
-      network_[l++]->Eval(batchSize, spare1, flow, spare2, scratch_mem,
-                          scratch_size_, nullptr, cublas,
-                          stream);  // value head
-      copyTypeConverted(opVal, (half*)spare1, wdl_ ? 3 * batchSize : batchSize,
-                        stream);
-    } else {
-      network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2,
-                          scratch_mem, scratch_size_, nullptr, cublas,
-                          stream);  // value head
+    network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2, scratch_mem,
+                        scratch_size_, nullptr, cublas,
+                        compute_stream);  // value head
+    if (!moves_left_ && !multi_stream_) {
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+      ReportCUDAErrors(
+          cudaEventRecordWithFlags(compute_ordering_event_, compute_stream,
+                                   capture ? cudaEventRecordExternal : 0));
+#endif
+    }
+    ReportCUDAErrors(cudaEventRecord(io->value_done_event_, compute_stream));
+    ReportCUDAErrors(
+        cudaStreamWaitEvent(download_stream, io->value_done_event_, 0));
+    ReportCUDAErrors(cudaMemcpyAsync(
+        io->op_value_mem_, io->op_value_mem_gpu_,
+        sizeof(io->op_value_mem_[0]) * (wdl_ ? 3 : 1) * batchSize,
+        cudaMemcpyDeviceToHost, download_stream));
+
+    if (wdl_) {
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+      ReportCUDAErrors(cudaEventRecordWithFlags(
+          io->wdl_download_done_event_, download_stream,
+          capture ? cudaEventRecordExternal : 0));
+#endif
     }
 
     if (moves_left_) {
       // Moves left head
       network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem,
                           scratch_size_, nullptr, cublas,
-                          stream);  // moves conv or embedding
+                          compute_stream);  // moves conv or embedding
 
       network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
                           scratch_size_, nullptr, cublas,
-                          stream);  // moves FC1
+                          compute_stream);  // moves FC1
 
       // Moves left FC2
-      if (fp16) {
-        // TODO: consider fusing the bias-add of FC2 with format conversion.
-        network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem,
-                            scratch_size_, nullptr, cublas, stream);
-        copyTypeConverted(opMov, (half*)(spare1), batchSize, stream);
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr,
-                            scratch_mem, scratch_size_, nullptr, cublas,
-                            stream);
+      network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr,
+                          scratch_mem, scratch_size_, nullptr, cublas,
+                          compute_stream);
+      if (!multi_stream_) {
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+        ReportCUDAErrors(
+            cudaEventRecordWithFlags(compute_ordering_event_, compute_stream,
+                                     capture ? cudaEventRecordExternal : 0));
+#endif
       }
+      ReportCUDAErrors(
+          cudaEventRecord(io->moves_left_done_event_, compute_stream));
+      ReportCUDAErrors(
+          cudaStreamWaitEvent(download_stream, io->moves_left_done_event_, 0));
+      ReportCUDAErrors(
+          cudaMemcpyAsync(io->op_moves_left_mem_, io->op_moves_left_mem_gpu_,
+                          sizeof(io->op_moves_left_mem_[0]) * batchSize,
+                          cudaMemcpyDeviceToHost, download_stream));
     }
-
-    if (multi_stream_) {
-      ReportCUDAErrors(cudaStreamSynchronize(stream));
-    } else {
-      ReportCUDAErrors(cudaDeviceSynchronize());
-      // The next thread can start using the GPU now.
-      lock_.unlock();
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+    ReportCUDAErrors(
+        cudaEventRecordWithFlags(io->download_done_event_, download_stream,
+                                 capture ? cudaEventRecordExternal : 0));
+#else
+    if (!capture) {
+      ReportCUDAErrors(
+          cudaEventRecord(io->download_done_event_, download_stream));
     }
+#endif
+  }
 
+  void finishEval(InputsOutputs<DataType>* io, int batchSize) {
+#if !CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+    ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_));
+#endif
     if (wdl_) {
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+      ReportCUDAErrors(cudaEventSynchronize(io->wdl_download_done_event_));
+#endif
       // Value softmax done cpu side.
       for (int i = 0; i < batchSize; i++) {
-        float w = io->op_value_mem_[3 * i + 0];
-        float d = io->op_value_mem_[3 * i + 1];
-        float l = io->op_value_mem_[3 * i + 2];
+        float* wdl = sizeof(io->op_value_mem_[0]) == sizeof(float)
+                         ? (float*)io->op_value_mem_
+                         : io->wdl_cpu_softmax_.get();
+        float w = FromType(io->op_value_mem_[3 * i + 0]);
+        float d = FromType(io->op_value_mem_[3 * i + 1]);
+        float l = FromType(io->op_value_mem_[3 * i + 2]);
         float m = std::max({w, d, l});
         w = std::exp(w - m);
         d = std::exp(d - m);
@@ -863,12 +1005,14 @@ class CudaNetwork : public Network {
         float sum = w + d + l;
         w /= sum;
         l /= sum;
-        d = 1.0f - w - l;
-        io->op_value_mem_[3 * i + 0] = w;
-        io->op_value_mem_[3 * i + 1] = d;
-        io->op_value_mem_[3 * i + 2] = l;
+        d /= sum;
+        wdl[2 * i + 0] = w - l;
+        wdl[2 * i + 1] = d;
       }
     }
+#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+    ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_));
+#endif
   }
 
   ~CudaNetwork() {
@@ -880,7 +1024,11 @@ class CudaNetwork : public Network {
       if (offset_pointers_) ReportCUDAErrors(cudaFree(offset_pointers_));
       if (head_offset_pointers_)
         ReportCUDAErrors(cudaFree(head_offset_pointers_));
-      cublasDestroy(cublas_);
+      ReportCUBLASErrors(cublasDestroy(cublas_));
+      ReportCUDAErrors(cudaStreamDestroy(compute_stream_));
+      ReportCUDAErrors(cudaStreamDestroy(upload_stream_));
+      ReportCUDAErrors(cudaStreamDestroy(download_stream_));
+      ReportCUDAErrors(cudaEventDestroy(compute_ordering_event_));
     }
   }
 
@@ -893,31 +1041,41 @@ class CudaNetwork : public Network {
     return 2 * sm_count_;
   }
 
+  int GetPreferredBatchStep() const override {
+    int step = 7;  // Smallest divisor of sm_count_ >= 7; stays 7 if none.
+    while (step < sm_count_ && sm_count_ % step != 0) step++;
+    return step;  // The bound keeps GPUs with < 7 SMs from looping forever.
+  }
+
   int GetThreads() const override { return 1 + multi_stream_; }
 
   std::unique_ptr<NetworkComputation> NewComputation() override {
     // Set correct gpu id for this computation (as it might have been called
     // from a different thread).
-    ReportCUDAErrors(cudaSetDevice(gpu_id_));
+    int device = -1;  // Device currently bound to the calling thread.
+    ReportCUDAErrors(cudaGetDevice(&device));
+    if (device != gpu_id_) {  // Switch devices only when it is needed.
+      ReportCUDAErrors(cudaSetDevice(gpu_id_));
+    }
     return std::make_unique<CudaNetworkComputation<DataType>>(this, wdl_,
                                                               moves_left_);
   }
 
-  std::unique_ptr<InputsOutputs> GetInputsOutputs() {
+  std::unique_ptr<InputsOutputs<DataType>> GetInputsOutputs() {
     std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
     if (free_inputs_outputs_.empty()) {
-      return std::make_unique<InputsOutputs>(
+      return std::make_unique<InputsOutputs<DataType>>(  // Pool is empty.
           max_batch_size_, wdl_, moves_left_, tensor_mem_size_, scratch_size_,
           !has_tensor_cores_ && std::is_same<half, DataType>::value);
     } else {
-      std::unique_ptr<InputsOutputs> resource =
+      std::unique_ptr<InputsOutputs<DataType>> resource =  // Reuse front.
           std::move(free_inputs_outputs_.front());
       free_inputs_outputs_.pop_front();
       return resource;
     }
   }
 
-  void ReleaseInputsOutputs(std::unique_ptr<InputsOutputs> resource) {
+  void ReleaseInputsOutputs(std::unique_ptr<InputsOutputs<DataType>> resource) {
     std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
     free_inputs_outputs_.push_back(std::move(resource));
   }
@@ -926,7 +1084,7 @@ class CudaNetwork : public Network {
   // This function invokes constructor just to please complier and silence
   // warning. Is never called (but compiler thinks that it could).
   void UglyFunctionToSilenceNvccWarning() {
-    InputsOutputs io(0, false, false, false);
+    InputsOutputs<DataType> io(0, false, false, false);
   }
 
  private:
@@ -936,6 +1094,7 @@ class CudaNetwork : public Network {
   int sm_count_;
   int max_batch_size_;
   int min_batch_size_;
+  bool enable_graph_capture_;
   bool wdl_;
   bool moves_left_;
   bool use_res_block_winograd_fuse_opt_;  // fuse operations inside the residual
@@ -972,11 +1131,15 @@ class CudaNetwork : public Network {
   bool has_tensor_cores_;
 
   // not used when multi-steam is enabled
+  cudaStream_t compute_stream_ = nullptr;
+  cudaStream_t upload_stream_ = nullptr;
+  cudaStream_t download_stream_ = nullptr;
+  cudaEvent_t compute_ordering_event_ = nullptr;
   cublasHandle_t cublas_;
   DataType* tensor_mem_[3];
 
   mutable std::mutex inputs_outputs_lock_;
-  std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;
+  std::list<std::unique_ptr<InputsOutputs<DataType>>> free_inputs_outputs_;
 
   void showInfo() const {
     int version;
@@ -997,9 +1160,12 @@ class CudaNetwork : public Network {
       major = CUDART_VERSION / 1000;
       minor = (CUDART_VERSION - major * 1000) / 10;
       pl = CUDART_VERSION - major * 1000 - minor * 10;
-      CERR << "WARNING: CUDA Runtime version mismatch, was compiled with "
-              "version "
-           << major << "." << minor << "." << pl;
+      // After cuda 11, newer version with same major is OK.
+      if (major < 11 || (major != version / 1000) || version < CUDART_VERSION) {
+        CERR << "WARNING: CUDA Runtime version mismatch, was compiled with "
+                "version "
+             << major << "." << minor << "." << pl;
+      }
     }
     cudaDriverGetVersion(&version);
     major = version / 1000;
@@ -1012,11 +1178,27 @@ class CudaNetwork : public Network {
     }
   }
 
-  void showDeviceInfo(const cudaDeviceProp& deviceProp) const {
+  void showDeviceInfo(const cudaDeviceProp& deviceProp,
+                      [[maybe_unused]] int deviceId) const {
     CERR << "GPU: " << deviceProp.name;
     CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30)
          << " Gb";
-    CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz";
+    // Get clock rate
+    float clockRateMHz;
+#if CUDART_VERSION >= 13000
+    int clockRatekHz;
+    cudaError_t err =
+        cudaDeviceGetAttribute(&clockRatekHz, cudaDevAttrClockRate, deviceId);
+    if (err != cudaSuccess) {
+      CERR << "Error getting clock rate: " << cudaGetErrorString(err);
+      clockRateMHz = 0.0f;  // Fallback value
+    } else {
+      clockRateMHz = clockRatekHz / 1e3f;
+    }
+#else
+    clockRateMHz = deviceProp.clockRate / 1e3f;
+#endif
+    CERR << "GPU clock frequency: " << clockRateMHz << " MHz";
     CERR << "GPU compute capability: " << deviceProp.major << "."
          << deviceProp.minor;
     CERR << "L2 cache capacity: " << deviceProp.l2CacheSize;
@@ -1040,9 +1222,40 @@ CudaNetworkComputation<DataType>::~CudaNetworkComputation() {
   network_->ReleaseInputsOutputs(std::move(inputs_outputs_));
 }
 
+template <typename DataType>
+void CudaNetworkComputation<DataType>::CaptureGraph(
+    std::unique_lock<std::mutex>&& lock) {
+  if (!network_->GetGraphCaptureEnabled()) return;  // Capture is turned off.
+  if (!CudaGraphCapture<DataType>::EnsureEnoughFreeMemory()) {
+    static std::once_flag flag;  // Lets call_once emit the warning once.
+    std::call_once(flag, []() {
+      CERR << "WARNING: Not enough GPU memory to capture CUDA graphs.";
+    });
+    return;  // Skip capture; the caller's lock is left untouched.
+  }
+  auto capture = network_->BeginCapture(*inputs_outputs_);
+  network_->forwardEval(inputs_outputs_.get(), GetBatchSize(), true);
+  capture.EndCapture();
+  if (lock.owns_lock()) lock.unlock();  // Release before storing the graph.
+  inputs_outputs_->cuda_graphs_[GetBatchSize() - 1] = capture;
+}
+
 template <typename DataType>
 void CudaNetworkComputation<DataType>::ComputeBlocking() {
-  network_->forwardEval(inputs_outputs_.get(), GetBatchSize());
+  LCTRACE_FUNCTION_SCOPE;
+  assert(GetBatchSize() >= 1);  // cuda_graphs_ is indexed by batch size - 1.
+  if (inputs_outputs_->cuda_graphs_[GetBatchSize() - 1]) {
+    std::unique_lock<std::mutex> lock = network_->LockEval();
+    network_->GraphLaunch(inputs_outputs_.get(), GetBatchSize());
+  } else {  // No graph stored for this batch size yet.
+    std::unique_lock<std::mutex> lock = network_->LockEval();
+#if !CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS
+    network_->UploadInputs(inputs_outputs_.get(), GetBatchSize());
+#endif
+    network_->forwardEval(inputs_outputs_.get(), GetBatchSize());
+    CaptureGraph(std::move(lock));  // May store a graph for this batch size.
+  }
+  network_->finishEval(inputs_outputs_.get(), GetBatchSize());
 }
 
 template <typename DataType>
diff --git a/src/neural/cuda/network_cudnn.cc b/src/neural/backends/cuda/network_cudnn.cc
similarity index 77%
rename from src/neural/cuda/network_cudnn.cc
rename to src/neural/backends/cuda/network_cudnn.cc
index d5b54decdc..edf7b592e6 100644
--- a/src/neural/cuda/network_cudnn.cc
+++ b/src/neural/backends/cuda/network_cudnn.cc
@@ -26,7 +26,6 @@
 */
 #include <algorithm>
 #include <cassert>
-#include <functional>
 #include <list>
 #include <memory>
 #include <mutex>
@@ -37,11 +36,10 @@
 #include "layers.h"
 #include "neural/factory.h"
 #include "neural/network_legacy.h"
-#include "neural/shared/activation.h"
-#include "neural/shared/attention_policy_map.h"
-#include "neural/shared/policy_map.h"
-#include "utils/bititer.h"
+#include "neural/tables/attention_policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "utils/exception.h"
+#include "utils/fp16_utils.h"
 
 // #define DEBUG_RAW_NPS
 
@@ -100,11 +98,10 @@ class CudnnNetworkComputation : public NetworkComputation {
     const auto iter_val =
         &inputs_outputs_->input_val_mem_[batch_size_ * kInputPlanes];
 
-    int i = 0;
-    for (const auto& plane : input) {
+    for (int i = 0; i < kInputPlanes; i++) {
+      const auto& plane = input[i];
       iter_mask[i] = plane.mask;
-      iter_val[i] = plane.value;
-      i++;
+      ToType(iter_val[i], plane.value);
     }
 
     batch_size_++;
@@ -112,41 +109,47 @@ class CudnnNetworkComputation : public NetworkComputation {
 
   void ComputeBlocking() override;
 
+  void CaptureGraph(std::unique_lock<std::mutex>&& lock = {});
+
   int GetBatchSize() const override { return batch_size_; }
 
   float GetQVal(int sample) const override {
     if (wdl_) {
-      auto w = inputs_outputs_->op_value_mem_[3 * sample + 0];
-      auto l = inputs_outputs_->op_value_mem_[3 * sample + 2];
-      return w - l;
-    } else {
-      return inputs_outputs_->op_value_mem_[sample];
+      const float* wdl =  // Float-sized outputs are read in place.
+          sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float)
+              ? (float*)inputs_outputs_->op_value_mem_
+              : inputs_outputs_->wdl_cpu_softmax_.get();
+      return wdl[2 * sample];  // First of a pair; presumably w - l (verify).
     }
+    return FromType(inputs_outputs_->op_value_mem_[sample]);
   }
 
   float GetDVal(int sample) const override {
     if (wdl_) {
-      auto d = inputs_outputs_->op_value_mem_[3 * sample + 1];
-      return d;
-    } else {
-      return 0.0f;
+      const float* wdl =  // Float-sized outputs are read in place.
+          sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float)
+              ? (float*)inputs_outputs_->op_value_mem_
+              : inputs_outputs_->wdl_cpu_softmax_.get();
+      return wdl[2 * sample + 1];  // Second of a pair; presumably d (verify).
     }
+    return 0.0f;
   }
 
   float GetPVal(int sample, int move_id) const override {
-    return inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id];
+    return FromType(
+        inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]);
   }
 
   float GetMVal(int sample) const override {
     if (moves_left_) {
-      return inputs_outputs_->op_moves_left_mem_[sample];
+      return FromType(inputs_outputs_->op_moves_left_mem_[sample]);
     }
     return 0.0f;
   }
 
  private:
   // Memory holding inputs, outputs.
-  std::unique_ptr<InputsOutputs> inputs_outputs_;
+  std::unique_ptr<InputsOutputs<DataType>> inputs_outputs_;
   int batch_size_;
   bool wdl_;
   bool moves_left_;
@@ -163,6 +166,7 @@ class CudnnNetwork : public Network {
                       file.format().network_format().moves_left()} {
     MultiHeadWeights weights(file.weights());
     gpu_id_ = options.GetOrDefault<int>("gpu", 0);
+    enable_graph_capture_ = options.GetOrDefault<bool>("graph_capture", true);
 
     conv_policy_ = file.format().network_format().policy() ==
                    pblczero::NetworkFormat::POLICY_CONVOLUTION;
@@ -190,7 +194,7 @@ class CudnnNetwork : public Network {
 
     cudaDeviceProp deviceProp = {};
     cudaGetDeviceProperties(&deviceProp, gpu_id_);
-    showDeviceInfo(deviceProp);
+    showDeviceInfo(deviceProp, gpu_id_);
 
     // Select GPU to run on (for *the current* thread).
     ReportCUDAErrors(cudaSetDevice(gpu_id_));
@@ -228,8 +232,19 @@ class CudnnNetwork : public Network {
       }
 
       // Override if forced from backend option
-      if (!options.IsDefault<bool>("nhwc")) nhwc_ = options.Get<bool>("nhwc");
+      if (options.Exists<bool>("nhwc")) nhwc_ = options.Get<bool>("nhwc");
     }
+    ReportCUDAErrors(
+        cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking));
+    ReportCUDAErrors(
+        cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking));
+    ReportCUDAErrors(
+        cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking));
+    ReportCUDAErrors(cudaEventCreateWithFlags(&compute_ordering_event_,
+                                              cudaEventDisableTiming));
+
+    ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_));
+    ReportCUDNNErrors(cudnnSetStream(cudnn_, compute_stream_));
 
     if (hasTensorCores)
       ReportCUBLASErrors(cublasSetMathMode(
@@ -289,7 +304,7 @@ class CudnnNetwork : public Network {
     }
 
     const bool custom_winograd_override =
-        !options.IsDefault<bool>("custom_winograd");
+        options.Exists<bool>("custom_winograd");
 
     if (!custom_winograd_override && use_custom_winograd_ &&
         transformed_residual_weight_size > 0.5 * deviceProp.totalGlobalMem) {
@@ -321,7 +336,7 @@ class CudnnNetwork : public Network {
         use_res_block_winograd_fuse_opt_ = true;
       }
       // Override if set in backend-opts.
-      if (!options.IsDefault<bool>("res_block_fusing")) {
+      if (options.Exists<bool>("res_block_fusing")) {
         use_res_block_winograd_fuse_opt_ =
             options.Get<bool>("res_block_fusing");
       }
@@ -528,7 +543,7 @@ class CudnnNetwork : public Network {
       if (attn_policy_) {
         auto AttentionPolicy = std::make_unique<AttentionPolicyHead<DataType>>(
             getLastLayer(), head, scratch_mem_, false, ACTIVATION_SELU,
-            max_batch_size_);
+            max_batch_size_, use_gemm_ex);
         network_.emplace_back(std::move(AttentionPolicy));
 
         auto policymap = std::make_unique<PolicyMapLayer<DataType>>(
@@ -587,8 +602,7 @@ class CudnnNetwork : public Network {
       auto FCVal1 = std::make_unique<FCLayer<DataType>>(
           getLastLayer(), head.ip1_val_b.size(), 1, 1, true,
           mish_net ? ACTIVATION_MISH : ACTIVATION_RELU);
-      FCVal1->LoadWeights(&head.ip1_val_w[0], &head.ip1_val_b[0],
-                          scratch_mem_);
+      FCVal1->LoadWeights(&head.ip1_val_w[0], &head.ip1_val_b[0], scratch_mem_);
       network_.emplace_back(std::move(FCVal1));
 
       wdl_ = file.format().network_format().value() ==
@@ -598,8 +612,7 @@ class CudnnNetwork : public Network {
       auto FCVal2 = std::make_unique<FCLayer<DataType>>(
           getLastLayer(), head.ip2_val_b.size(), 1, 1, true,
           fc2_tanh ? ACTIVATION_TANH : ACTIVATION_NONE);
-      FCVal2->LoadWeights(&head.ip2_val_w[0], &head.ip2_val_b[0],
-                          scratch_mem_);
+      FCVal2->LoadWeights(&head.ip2_val_w[0], &head.ip2_val_b[0], scratch_mem_);
       network_.emplace_back(std::move(FCVal2));
     }
     value_out_ = getLastLayer();
@@ -665,45 +678,94 @@ class CudnnNetwork : public Network {
     CERR << "allocated " << 3 * maxSize
          << " bytes of GPU memory to run the network";
 #endif
+
+    // pre-allocate cuda graphs for search threads
+    auto allocateCudaGraphs = [&] {
+      CudnnNetworkComputation<DataType> comp(this, wdl_, moves_left_);
+      comp.AddInput(InputPlanes{(size_t)kNumInputPlanes});
+      // Make sure cublas is initialized in this thread.
+      comp.ComputeBlocking();
+      for (int i = 0; i < GetMiniBatchSize(); i++) {
+        comp.AddInput(InputPlanes{(size_t)kNumInputPlanes});
+        auto lock = LockEval();
+        comp.CaptureGraph(std::move(lock));
+      }
+    };
+    std::thread t2(allocateCudaGraphs);
+    allocateCudaGraphs();
+    t2.join();
   }
 
-  void forwardEval(InputsOutputs* io, int batchSize) {
+  std::unique_lock<std::mutex> LockEval() {
+    return std::unique_lock<std::mutex>{lock_};  // Acquires lock_ here.
+  }
+
+  bool GetGraphCaptureEnabled() const { return enable_graph_capture_; }
+
+  CudaGraphCapture<DataType> BeginCapture(InputsOutputs<DataType>& io) {
+    return {io, compute_stream_, download_stream_};
+  }
+
+  void UploadInputs(InputsOutputs<DataType>* io, int batchSize) {
+    ReportCUDAErrors(
+        cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_,
+                        batchSize * kInputPlanes * sizeof(uint64_t),
+                        cudaMemcpyHostToDevice, upload_stream_));  // masks
+    ReportCUDAErrors(cudaMemcpyAsync(
+        io->input_val_mem_gpu_, io->input_val_mem_,
+        batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]),
+        cudaMemcpyHostToDevice, upload_stream_));  // values
+    ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream_));
+    ReportCUDAErrors(
+        cudaStreamWaitEvent(compute_stream_, io->upload_done_event_, 0));
+  }
+
+  void GraphLaunch(InputsOutputs<DataType>* io, int batchSize) {
+    UploadInputs(io, batchSize);  // compute_stream_ waits for these copies.
+
+    // cudaGraphUpload was added in CUDA 11.1, hence the version guard below.
+#if CUDART_VERSION >= 11010
+    // Make sure the graph has finished uploading before launching it.
+    ReportCUDAErrors(cudaStreamSynchronize(io->exec_stream_));
+#endif
+
+    io->cuda_graphs_[batchSize - 1].Launch(compute_stream_);
+    ReportCUDAErrors(
+        cudaEventRecord(io->download_done_event_, compute_stream_));
+  }
+
+  void forwardEval(InputsOutputs<DataType>* io, int batchSize,
+                   bool capture = false) {
     // It is safe to evaluate larger than the batchSize
     // as all buffers are designed to handle max_batch_size
     // and the extra invalid results are never read.
     if (batchSize < min_batch_size_) batchSize = min_batch_size_;
-    std::unique_lock<std::mutex> lock(lock_);
 
 #ifdef DEBUG_RAW_NPS
     auto t_start = std::chrono::high_resolution_clock::now();
 #endif
 
     // TODO: consider supporting multi-stream path for cudnn backend too.
-    cudaStream_t stream = 0;  // default stream
+    cudaStream_t compute_stream = compute_stream_;
+    cudaStream_t download_stream = download_stream_;
 
     // Expand packed planes to full planes.
-    uint64_t* ipDataMasks = io->input_masks_mem_gpu_;
-    float* ipDataValues = io->input_val_mem_gpu_;
+    const uint64_t* ipDataMasks = io->input_masks_mem_gpu_;
+    const auto* ipDataValues = io->input_val_mem_gpu_;
 
-    bool fp16 = std::is_same<half, DataType>::value;
-    if (fp16) {
-      if (nhwc_)
-        expandPlanes_Fp16_NHWC((half*)(tensor_mem_[0]), ipDataMasks,
-                               ipDataValues, batchSize * kInputPlanes, stream);
-      else
-        expandPlanes_Fp16_NCHW((half*)(tensor_mem_[0]), ipDataMasks,
-                               ipDataValues, batchSize * kInputPlanes, stream);
-    } else {
-      expandPlanes_Fp32_NCHW((float*)(tensor_mem_[0]), ipDataMasks,
-                             ipDataValues, batchSize * kInputPlanes, stream);
-    }
+    if (nhwc_)
+      expandPlanes_NHWC(tensor_mem_[0], ipDataMasks, ipDataValues,
+                        batchSize * kInputPlanes, compute_stream);
+    else
+      expandPlanes_NCHW(tensor_mem_[0], ipDataMasks, ipDataValues,
+                        batchSize * kInputPlanes, compute_stream);
 
     // debug code example
     // dumpTensor(tensor_mem_[0], 1024, "After expand Planes", fp16);
 
-    float* opPol = io->op_policy_mem_gpu_;
-    float* opVal = io->op_value_mem_gpu_;
-    float* opMov = io->op_moves_left_mem_gpu_;
+    auto* opPol = io->op_policy_mem_gpu_;
+    auto* opVal = io->op_value_mem_gpu_;
+    auto* opMov = io->op_moves_left_mem_gpu_;
 
     int l = 0;
     // Input.
@@ -711,40 +773,40 @@ class CudnnNetwork : public Network {
         batchSize,
         use_res_block_winograd_fuse_opt_ ? tensor_mem_[1] : tensor_mem_[2],
         tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_,
-        stream);  // input conv
+        compute_stream);  // input conv
 
     // Residual block.
     for (int block = 0; block < numBlocks_; block++) {
       if (use_res_block_winograd_fuse_opt_) {
         network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], nullptr,
                             scratch_mem_, scratch_size_, cudnn_, cublas_,
-                            stream);  // block
+                            compute_stream);  // block
       } else {
         network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr,
                             scratch_mem_, scratch_size_, cudnn_, cublas_,
-                            stream);  // conv1
+                            compute_stream);  // conv1
 
         if (use_custom_winograd_) {
           network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0],
                               tensor_mem_[2], scratch_mem_, scratch_size_,
-                              cudnn_, cublas_, stream);  // conv2
+                              cudnn_, cublas_, compute_stream);  // conv2
         } else {
           // For SE Resnet, skip connection is added after SE (and bias is added
           // as part of SE).
           if (has_se_) {
             network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0],
                                 nullptr, scratch_mem_, scratch_size_, cudnn_,
-                                cublas_, stream);  // conv2
+                                cublas_, compute_stream);  // conv2
           } else {
             network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0],
                                 tensor_mem_[2], scratch_mem_, scratch_size_,
-                                cudnn_, cublas_, stream);  // conv2
+                                cudnn_, cublas_, compute_stream);  // conv2
           }
 
           if (has_se_) {
             network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1],
                                 tensor_mem_[2], scratch_mem_, scratch_size_,
-                                cudnn_, cublas_, stream);  // SE layer
+                                cudnn_, cublas_, compute_stream);  // SE layer
           }
         }
       }
@@ -754,125 +816,110 @@ class CudnnNetwork : public Network {
     if (attn_policy_) {
       network_[l++]->Eval(
           batchSize, tensor_mem_[0], tensor_mem_[2], tensor_mem_[1],
-          scratch_mem_, scratch_size_, nullptr, cublas_, stream,
+          scratch_mem_, scratch_size_, nullptr, cublas_, compute_stream,
           &head_offset_pointers_);  // Entire Attention policy head except for
                                     // the policy map
-      if (fp16) {
-        network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
-                            scratch_mem_, scratch_size_, nullptr, cublas_,
-                            stream);  // policy map layer
-        copyTypeConverted(opPol, (half*)(tensor_mem_[1]),
-                          batchSize * kNumOutputPolicy,
-                          stream);  // POLICY output
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0],
-                            nullptr, scratch_mem_, scratch_size_, nullptr,
-                            cublas_, stream);  // policy map layer
-                                               // POLICY output
-      }
+      network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], nullptr,
+                          scratch_mem_, scratch_size_, nullptr, cublas_,
+                          compute_stream);  // policy map layer
+                                            // POLICY output
 
     } else if (conv_policy_) {
       network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr,
                           scratch_mem_, scratch_size_, cudnn_, cublas_,
-                          stream);  // policy conv1
+                          compute_stream);  // policy conv1
 
       network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
                           scratch_mem_, scratch_size_, cudnn_, cublas_,
-                          stream);  // policy conv2
+                          compute_stream);  // policy conv2
 
-      if (fp16) {
-        network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr,
-                            scratch_mem_, scratch_size_, cudnn_, cublas_,
-                            stream);  // policy map layer
-        copyTypeConverted(opPol, (half*)(tensor_mem_[0]),
-                          batchSize * kNumOutputPolicy,
-                          stream);  // POLICY output
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[1],
-                            nullptr, scratch_mem_, scratch_size_, cudnn_,
-                            cublas_,
-                            stream);  // policy map layer  // POLICY output
-      }
+      network_[l++]->Eval(
+          batchSize, (DataType*)opPol, tensor_mem_[1], nullptr, scratch_mem_,
+          scratch_size_, cudnn_, cublas_,
+          compute_stream);  // policy map layer  // POLICY output
     } else {
       network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr,
                           scratch_mem_, scratch_size_, cudnn_, cublas_,
-                          stream);  // pol conv
-
-      if (fp16) {
-        network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
-                            scratch_mem_, scratch_size_, cudnn_, cublas_,
-                            stream);  // pol FC
+                          compute_stream);  // pol conv
 
-        copyTypeConverted(opPol, (half*)(tensor_mem_[1]),
-                          batchSize * kNumOutputPolicy, stream);  // POLICY
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0],
-                            nullptr, scratch_mem_, scratch_size_, cudnn_,
-                            cublas_, stream);  // pol FC  // POLICY
-      }
+      network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], nullptr,
+                          scratch_mem_, scratch_size_, cudnn_, cublas_,
+                          compute_stream);  // pol FC  // POLICY
     }
 
+    ReportCUDAErrors(cudaEventRecord(io->policy_done_event_, compute_stream));
+    ReportCUDAErrors(
+        cudaStreamWaitEvent(download_stream, io->policy_done_event_, 0));
+
     // Copy policy output from device memory to host memory.
     ReportCUDAErrors(cudaMemcpyAsync(
         io->op_policy_mem_, io->op_policy_mem_gpu_,
-        sizeof(float) * kNumOutputPolicy * batchSize, cudaMemcpyDeviceToHost));
+        sizeof(io->op_policy_mem_[0]) * kNumOutputPolicy * batchSize,
+        cudaMemcpyDeviceToHost, download_stream));
 
     // value head
     network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr,
                         scratch_mem_, scratch_size_, cudnn_, cublas_,
-                        stream);  // value conv
+                        compute_stream);  // value conv
 
     network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
                         scratch_mem_, scratch_size_, cudnn_, cublas_,
-                        stream);  // value FC1
+                        compute_stream);  // value FC1
 
-    if (fp16) {
-      // TODO: consider fusing the bias-add of FC2 with format conversion.
-      network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr,
-                          scratch_mem_, scratch_size_, cudnn_, cublas_,
-                          stream);  // value FC2
-      copyTypeConverted(opVal, (half*)(tensor_mem_[0]),
-                        wdl_ ? 3 * batchSize : batchSize, stream);  // VALUE
-    } else {
-      network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[1], nullptr,
-                          scratch_mem_, scratch_size_, cudnn_, cublas_,
-                          stream);  // value FC2    // VALUE
-    }
+    network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[1], nullptr,
+                        scratch_mem_, scratch_size_, cudnn_, cublas_,
+                        compute_stream);  // value FC2    // VALUE
+
+    ReportCUDAErrors(cudaEventRecord(io->value_done_event_, compute_stream));
+    ReportCUDAErrors(
+        cudaStreamWaitEvent(download_stream, io->value_done_event_, 0));
+    ReportCUDAErrors(cudaMemcpyAsync(
+        io->op_value_mem_, io->op_value_mem_gpu_,
+        sizeof(io->op_value_mem_[0]) * (wdl_ ? 3 : 1) * batchSize,
+        cudaMemcpyDeviceToHost, download_stream));
 
     if (moves_left_) {
       // Moves left head
       network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr,
                           scratch_mem_, scratch_size_, cudnn_, cublas_,
-                          stream);  // moves conv
+                          compute_stream);  // moves conv
 
       network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
                           scratch_mem_, scratch_size_, cudnn_, cublas_,
-                          stream);  // moves FC1
+                          compute_stream);  // moves FC1
 
       // Moves left FC2
-      if (fp16) {
-        // TODO: consider fusing the bias-add of FC2 with format conversion.
-        network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr,
-                            scratch_mem_, scratch_size_, cudnn_, cublas_,
-                            stream);
-        copyTypeConverted(opMov, (half*)(tensor_mem_[0]), batchSize, stream);
-      } else {
-        network_[l++]->Eval(batchSize, (DataType*)opMov, tensor_mem_[1],
-                            nullptr, scratch_mem_, scratch_size_, cudnn_,
-                            cublas_, stream);
-      }
+      network_[l++]->Eval(batchSize, (DataType*)opMov, tensor_mem_[1], nullptr,
+                          scratch_mem_, scratch_size_, cudnn_, cublas_,
+                          compute_stream);
+
+      ReportCUDAErrors(
+          cudaEventRecord(io->moves_left_done_event_, compute_stream));
+      ReportCUDAErrors(
+          cudaStreamWaitEvent(download_stream, io->moves_left_done_event_, 0));
+      ReportCUDAErrors(
+          cudaMemcpyAsync(io->op_moves_left_mem_, io->op_moves_left_mem_gpu_,
+                          sizeof(io->op_moves_left_mem_[0]) * batchSize,
+                          cudaMemcpyDeviceToHost, download_stream));
     }
 
-    ReportCUDAErrors(cudaDeviceSynchronize());
-    // The next thread can start using the GPU now.
-    lock.unlock();
+    if (!capture) {
+      ReportCUDAErrors(
+          cudaEventRecord(io->download_done_event_, download_stream));
+    }
+  }
 
+  void finishEval(InputsOutputs<DataType>* io, int batchSize) {
+    ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_));
     if (wdl_) {
       // Value softmax done cpu side.
       for (int i = 0; i < batchSize; i++) {
-        float w = io->op_value_mem_[3 * i + 0];
-        float d = io->op_value_mem_[3 * i + 1];
-        float l = io->op_value_mem_[3 * i + 2];
+        float* wdl = sizeof(io->op_value_mem_[0]) == sizeof(float)
+                         ? (float*)io->op_value_mem_
+                         : io->wdl_cpu_softmax_.get();
+        float w = FromType(io->op_value_mem_[3 * i + 0]);
+        float d = FromType(io->op_value_mem_[3 * i + 1]);
+        float l = FromType(io->op_value_mem_[3 * i + 2]);
         float m = std::max({w, d, l});
         w = std::exp(w - m);
         d = std::exp(d - m);
@@ -880,10 +927,9 @@ class CudnnNetwork : public Network {
         float sum = w + d + l;
         w /= sum;
         l /= sum;
-        d = 1.0f - w - l;
-        io->op_value_mem_[3 * i + 0] = w;
-        io->op_value_mem_[3 * i + 1] = d;
-        io->op_value_mem_[3 * i + 2] = l;
+        d /= sum;
+        wdl[2 * i + 0] = w - l;
+        wdl[2 * i + 1] = d;
       }
     }
 
@@ -922,6 +968,9 @@ class CudnnNetwork : public Network {
       ReportCUDAErrors(cudaFree(head_offset_pointers_));
     cudnnDestroy(cudnn_);
     cublasDestroy(cublas_);
+    ReportCUDAErrors(cudaStreamDestroy(compute_stream_));
+    ReportCUDAErrors(cudaStreamDestroy(upload_stream_));
+    ReportCUDAErrors(cudaStreamDestroy(download_stream_));
   }
 
   const NetworkCapabilities& GetCapabilities() const override {
@@ -931,25 +980,29 @@ class CudnnNetwork : public Network {
   std::unique_ptr<NetworkComputation> NewComputation() override {
     // Set correct gpu id for this computation (as it might have been called
     // from a different thread).
-    ReportCUDAErrors(cudaSetDevice(gpu_id_));
+    int device = -1;  // Receives the device current for the calling thread.
+    ReportCUDAErrors(cudaGetDevice(&device));
+    if (device != gpu_id_) {  // Switch only when another device is current.
+      ReportCUDAErrors(cudaSetDevice(gpu_id_));
+    }
     return std::make_unique<CudnnNetworkComputation<DataType>>(this, wdl_,
                                                                 moves_left_);
   }
 
-  std::unique_ptr<InputsOutputs> GetInputsOutputs() {
+  std::unique_ptr<InputsOutputs<DataType>> GetInputsOutputs() {
     std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
     if (free_inputs_outputs_.empty()) {
-      return std::make_unique<InputsOutputs>(max_batch_size_, wdl_,
-                                             moves_left_);
+      return std::make_unique<InputsOutputs<DataType>>(max_batch_size_, wdl_,
+                                                       moves_left_);  // New.
     } else {
-      std::unique_ptr<InputsOutputs> resource =
+      std::unique_ptr<InputsOutputs<DataType>> resource =  // Reuse free entry.
           std::move(free_inputs_outputs_.front());
       free_inputs_outputs_.pop_front();
       return resource;
     }
   }
 
-  void ReleaseInputsOutputs(std::unique_ptr<InputsOutputs> resource) {
+  void ReleaseInputsOutputs(std::unique_ptr<InputsOutputs<DataType>> resource) {
     std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
     free_inputs_outputs_.push_back(std::move(resource));
   }
@@ -957,7 +1010,9 @@ class CudnnNetwork : public Network {
   // Apparently nvcc doesn't see constructor invocations through make_unique.
   // This function invokes constructor just to please complier and silence
   // warning. Is never called (but compiler thinks that it could).
-  void UglyFunctionToSilenceNvccWarning() { InputsOutputs io(0, false, false, false); }
+  void UglyFunctionToSilenceNvccWarning() {
+    InputsOutputs<DataType> io(0, false, false, false);
+  }
 
  private:
   const NetworkCapabilities capabilities_;
@@ -966,6 +1021,7 @@ class CudnnNetwork : public Network {
   int gpu_id_;
   int max_batch_size_;
   int min_batch_size_;
+  bool enable_graph_capture_;
   bool wdl_;
   bool moves_left_;
 
@@ -1000,7 +1056,12 @@ class CudnnNetwork : public Network {
   size_t scratch_size_;
 
   mutable std::mutex inputs_outputs_lock_;
-  std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;
+  std::list<std::unique_ptr<InputsOutputs<DataType>>> free_inputs_outputs_;
+
+  cudaStream_t compute_stream_ = nullptr;
+  cudaStream_t upload_stream_ = nullptr;
+  cudaStream_t download_stream_ = nullptr;
+  cudaEvent_t compute_ordering_event_ = nullptr;
 
   void showInfo() const {
     int version;
@@ -1021,16 +1082,20 @@ class CudnnNetwork : public Network {
       major = CUDART_VERSION / 1000;
       minor = (CUDART_VERSION - major * 1000) / 10;
       pl = CUDART_VERSION - major * 1000 - minor * 10;
-      CERR << "WARNING: CUDA Runtime version mismatch, was compiled with "
-              "version "
-           << major << "." << minor << "." << pl;
+      // After cuda 11, newer version with same major is OK.
+      if (major < 11 || (major != version / 1000) || version < CUDART_VERSION) {
+        CERR << "WARNING: CUDA Runtime version mismatch, was compiled with "
+                "version "
+             << major << "." << minor << "." << pl;
+      }
     }
     version = (int)cudnnGetVersion();
     major = version / 1000;
     minor = (version - major * 1000) / 100;
     pl = version - major * 1000 - minor * 100;
     CERR << "Cudnn version: " << major << "." << minor << "." << pl;
-    if (version != CUDNN_VERSION) {
+    // Assuming CUDNN > 7.
+    if (major != CUDNN_MAJOR || minor < CUDNN_MINOR) {
       CERR << "WARNING: CUDNN Runtime version mismatch, was compiled with "
               "version "
            << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL;
@@ -1046,11 +1111,27 @@ class CudnnNetwork : public Network {
     }
   }
 
-  void showDeviceInfo(const cudaDeviceProp& deviceProp) const {
+  void showDeviceInfo(const cudaDeviceProp& deviceProp,
+                      [[maybe_unused]] int deviceId) const {
     CERR << "GPU: " << deviceProp.name;
     CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30)
          << " GiB";
-    CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz";
+    // Get clock rate
+    float clockRateMHz;
+#if CUDART_VERSION >= 13000
+    int clockRatekHz;
+    cudaError_t err =
+        cudaDeviceGetAttribute(&clockRatekHz, cudaDevAttrClockRate, deviceId);
+    if (err != cudaSuccess) {
+      CERR << "Error getting clock rate: " << cudaGetErrorString(err);
+      clockRateMHz = 0.0f;  // Fallback value
+    } else {
+      clockRateMHz = clockRatekHz / 1e3f;
+    }
+#else
+    clockRateMHz = deviceProp.clockRate / 1e3f;
+#endif
+    CERR << "GPU clock frequency: " << clockRateMHz << " MHz";
     CERR << "GPU compute capability: " << deviceProp.major << "."
          << deviceProp.minor;
 
@@ -1079,9 +1160,37 @@ CudnnNetworkComputation<DataType>::~CudnnNetworkComputation() {
   network_->ReleaseInputsOutputs(std::move(inputs_outputs_));
 }
 
+template <typename DataType>
+void CudnnNetworkComputation<DataType>::CaptureGraph(
+    std::unique_lock<std::mutex>&& lock) {  // Caller's lock; may be released.
+  if (!network_->GetGraphCaptureEnabled()) return;  // Capture is disabled.
+  if (!CudaGraphCapture<DataType>::EnsureEnoughFreeMemory()) {
+    static std::once_flag flag;  // Warn at most once per DataType.
+    std::call_once(flag, []() {
+      CERR << "WARNING: Not enough GPU memory to capture CUDA graphs.";
+    });
+    return;  // No graph is stored for this batch size.
+  }
+  CudaGraphCapture capture = network_->BeginCapture(*inputs_outputs_);
+  network_->forwardEval(inputs_outputs_.get(), GetBatchSize(), true);  // NOTE(review): `true` presumably selects capture mode - confirm.
+  capture.EndCapture();
+  if (lock.owns_lock()) lock.unlock();  // Unlock before storing the graph.
+  inputs_outputs_->cuda_graphs_[GetBatchSize() - 1] = capture;  // Slot: size-1.
+}
+
 template <typename DataType>
 void CudnnNetworkComputation<DataType>::ComputeBlocking() {
-  network_->forwardEval(inputs_outputs_.get(), GetBatchSize());
+  assert(GetBatchSize() >= 1);  // cuda_graphs_ is indexed by batch size - 1.
+  if (inputs_outputs_->cuda_graphs_[GetBatchSize() - 1]) {  // Graph stored.
+    std::unique_lock<std::mutex> lock = network_->LockEval();
+    network_->GraphLaunch(inputs_outputs_.get(), GetBatchSize());
+  } else {  // No graph stored for this batch size: run the steps directly.
+    std::unique_lock<std::mutex> lock = network_->LockEval();
+    network_->UploadInputs(inputs_outputs_.get(), GetBatchSize());
+    network_->forwardEval(inputs_outputs_.get(), GetBatchSize());
+    CaptureGraph(std::move(lock));  // May unlock `lock` before returning.
+  }
+  network_->finishEval(inputs_outputs_.get(), GetBatchSize());  // Lock is gone.
 }
 
 template <typename DataType>
diff --git a/src/neural/cuda/readme.txt b/src/neural/backends/cuda/readme.txt
similarity index 100%
rename from src/neural/cuda/readme.txt
rename to src/neural/backends/cuda/readme.txt
diff --git a/src/neural/cuda/winograd_helper.inc b/src/neural/backends/cuda/winograd_helper.inc
similarity index 99%
rename from src/neural/cuda/winograd_helper.inc
rename to src/neural/backends/cuda/winograd_helper.inc
index 72e9828bb9..749181eee4 100644
--- a/src/neural/cuda/winograd_helper.inc
+++ b/src/neural/backends/cuda/winograd_helper.inc
@@ -843,14 +843,15 @@ __global__ __launch_bounds__(
 }
 
 template <typename T>
-void FilterTransform(int N, int C, T* transformedFilter, const T* filter) {
+void FilterTransform(int N, int C, T* transformedFilter, const T* filter,
+                     cudaStream_t stream) {  // `stream`: kernel launch stream.
   // Each thread processes entire filter block (input 3x3 elements -> output 6x6
   // elements)
   const int kBlockSize = 64;
   const int kBlocks = DivUp(N * C, kBlockSize);
 
-  filterTransform_kernel<<<kBlocks, kBlockSize>>>(N, C, N * C,
-                                                  transformedFilter, filter);
+  filterTransform_kernel<<<kBlocks, kBlockSize, 0, stream>>>(
+      N, C, N * C, transformedFilter, filter);  // 0: no dynamic shared memory.
 
   ReportCUDAErrors(cudaGetLastError());
 }
diff --git a/src/neural/dx/MetaCommand.h b/src/neural/backends/dx/MetaCommand.h
similarity index 100%
rename from src/neural/dx/MetaCommand.h
rename to src/neural/backends/dx/MetaCommand.h
diff --git a/src/neural/dx/dx_common.h b/src/neural/backends/dx/dx_common.h
similarity index 100%
rename from src/neural/dx/dx_common.h
rename to src/neural/backends/dx/dx_common.h
diff --git a/src/neural/dx/layers_dx.cc b/src/neural/backends/dx/layers_dx.cc
similarity index 100%
rename from src/neural/dx/layers_dx.cc
rename to src/neural/backends/dx/layers_dx.cc
diff --git a/src/neural/dx/layers_dx.h b/src/neural/backends/dx/layers_dx.h
similarity index 100%
rename from src/neural/dx/layers_dx.h
rename to src/neural/backends/dx/layers_dx.h
diff --git a/src/neural/dx/network_dx.cc b/src/neural/backends/dx/network_dx.cc
similarity index 99%
rename from src/neural/dx/network_dx.cc
rename to src/neural/backends/dx/network_dx.cc
index 1c34a6fb2d..1000b88422 100644
--- a/src/neural/dx/network_dx.cc
+++ b/src/neural/backends/dx/network_dx.cc
@@ -35,7 +35,7 @@
 #include <vector>
 
 #include "layers_dx.h"
-#include "neural/shared/policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "shader_wrapper.h"
 #include "utils/bititer.h"
 #include "utils/exception.h"
diff --git a/src/neural/dx/network_dx.h b/src/neural/backends/dx/network_dx.h
similarity index 100%
rename from src/neural/dx/network_dx.h
rename to src/neural/backends/dx/network_dx.h
diff --git a/src/neural/dx/shader_wrapper.cc b/src/neural/backends/dx/shader_wrapper.cc
similarity index 100%
rename from src/neural/dx/shader_wrapper.cc
rename to src/neural/backends/dx/shader_wrapper.cc
diff --git a/src/neural/dx/shader_wrapper.h b/src/neural/backends/dx/shader_wrapper.h
similarity index 100%
rename from src/neural/dx/shader_wrapper.h
rename to src/neural/backends/dx/shader_wrapper.h
diff --git a/src/neural/dx/shaders/AddVectors.hlsl b/src/neural/backends/dx/shaders/AddVectors.hlsl
similarity index 100%
rename from src/neural/dx/shaders/AddVectors.hlsl
rename to src/neural/backends/dx/shaders/AddVectors.hlsl
diff --git a/src/neural/dx/shaders/Conv1x1.hlsl b/src/neural/backends/dx/shaders/Conv1x1.hlsl
similarity index 100%
rename from src/neural/dx/shaders/Conv1x1.hlsl
rename to src/neural/backends/dx/shaders/Conv1x1.hlsl
diff --git a/src/neural/dx/shaders/ExpandPlanes.hlsl b/src/neural/backends/dx/shaders/ExpandPlanes.hlsl
similarity index 100%
rename from src/neural/dx/shaders/ExpandPlanes.hlsl
rename to src/neural/backends/dx/shaders/ExpandPlanes.hlsl
diff --git a/src/neural/dx/shaders/Gemm.hlsl b/src/neural/backends/dx/shaders/Gemm.hlsl
similarity index 100%
rename from src/neural/dx/shaders/Gemm.hlsl
rename to src/neural/backends/dx/shaders/Gemm.hlsl
diff --git a/src/neural/dx/shaders/PolicyMap.hlsl b/src/neural/backends/dx/shaders/PolicyMap.hlsl
similarity index 100%
rename from src/neural/dx/shaders/PolicyMap.hlsl
rename to src/neural/backends/dx/shaders/PolicyMap.hlsl
diff --git a/src/neural/dx/shaders/SE.hlsl b/src/neural/backends/dx/shaders/SE.hlsl
similarity index 100%
rename from src/neural/dx/shaders/SE.hlsl
rename to src/neural/backends/dx/shaders/SE.hlsl
diff --git a/src/neural/dx/shaders/WinogradCommon.h b/src/neural/backends/dx/shaders/WinogradCommon.h
similarity index 100%
rename from src/neural/dx/shaders/WinogradCommon.h
rename to src/neural/backends/dx/shaders/WinogradCommon.h
diff --git a/src/neural/dx/shaders/WinogradTransform.hlsl b/src/neural/backends/dx/shaders/WinogradTransform.hlsl
similarity index 100%
rename from src/neural/dx/shaders/WinogradTransform.hlsl
rename to src/neural/backends/dx/shaders/WinogradTransform.hlsl
diff --git a/src/neural/dx/shaders/WinogradTransformSE.hlsl b/src/neural/backends/dx/shaders/WinogradTransformSE.hlsl
similarity index 100%
rename from src/neural/dx/shaders/WinogradTransformSE.hlsl
rename to src/neural/backends/dx/shaders/WinogradTransformSE.hlsl
diff --git a/src/neural/dx/shaders/dxc_helper.py b/src/neural/backends/dx/shaders/dxc_helper.py
similarity index 100%
rename from src/neural/dx/shaders/dxc_helper.py
rename to src/neural/backends/dx/shaders/dxc_helper.py
diff --git a/src/neural/dx/shaders/meson.build b/src/neural/backends/dx/shaders/meson.build
similarity index 100%
rename from src/neural/dx/shaders/meson.build
rename to src/neural/backends/dx/shaders/meson.build
diff --git a/src/neural/dx/shaders/shader_shared.h b/src/neural/backends/dx/shaders/shader_shared.h
similarity index 100%
rename from src/neural/dx/shaders/shader_shared.h
rename to src/neural/backends/dx/shaders/shader_shared.h
diff --git a/src/neural/dx/shaders/shaders.h b/src/neural/backends/dx/shaders/shaders.h
similarity index 100%
rename from src/neural/dx/shaders/shaders.h
rename to src/neural/backends/dx/shaders/shaders.h
diff --git a/src/neural/metal/metal_common.h b/src/neural/backends/metal/metal_common.h
similarity index 80%
rename from src/neural/metal/metal_common.h
rename to src/neural/backends/metal/metal_common.h
index a42c00dcac..0c76d7395b 100644
--- a/src/neural/metal/metal_common.h
+++ b/src/neural/backends/metal/metal_common.h
@@ -36,14 +36,13 @@ static int kInputPlanes = 112;
 struct InputsOutputs {
   InputsOutputs(int maxBatchSize, bool wdl, bool moves_left, bool conv_policy,
                 bool attn_policy) {
-    input_masks_mem_.reserve(maxBatchSize * kInputPlanes);
-    input_val_mem_.reserve(maxBatchSize * kInputPlanes);
-    input_val_mem_expanded_.reserve(maxBatchSize * kInputPlanes * 64);
-    op_policy_mem_.reserve(maxBatchSize * kNumOutputPolicy);
-    op_value_mem_.reserve(maxBatchSize * (wdl ? 3 : 1));
+    input_masks_mem_.resize(maxBatchSize * kInputPlanes);
+    input_val_mem_.resize(maxBatchSize * kInputPlanes);
+    op_policy_mem_.resize(maxBatchSize * kNumOutputPolicy);
+    op_value_mem_.resize(maxBatchSize * (wdl ? 3 : 1));
 
     if (moves_left) {
-      op_moves_left_mem_.reserve(maxBatchSize);
+      op_moves_left_mem_.resize(maxBatchSize);
     };
 
     /**
@@ -53,16 +52,15 @@ struct InputsOutputs {
      * Remove this op_policy_raw_mem_ memory allocation when bug is fixed.
      */
     if (attn_policy) {
-      op_policy_raw_mem_.reserve(maxBatchSize * (64 * 64 + 8 * 24));
+      op_policy_raw_mem_.resize(maxBatchSize * (64 * 64 + 8 * 24));
     } else if (conv_policy) {
-      op_policy_raw_mem_.reserve(maxBatchSize * 73 * 64);
+      op_policy_raw_mem_.resize(maxBatchSize * 73 * 64);
     }
   }
   ~InputsOutputs() {}
 
   std::vector<uint64_t> input_masks_mem_;
   std::vector<float> input_val_mem_;
-  std::vector<float> input_val_mem_expanded_;
   std::vector<float> op_policy_mem_;
   std::vector<float> op_value_mem_;
   std::vector<float> op_moves_left_mem_;
diff --git a/src/neural/metal/mps/MetalNetworkBuilder.h b/src/neural/backends/metal/mps/MetalNetworkBuilder.h
similarity index 96%
rename from src/neural/metal/mps/MetalNetworkBuilder.h
rename to src/neural/backends/metal/mps/MetalNetworkBuilder.h
index 74ddd6bcaa..869e014005 100644
--- a/src/neural/metal/mps/MetalNetworkBuilder.h
+++ b/src/neural/backends/metal/mps/MetalNetworkBuilder.h
@@ -51,7 +51,7 @@ class MetalNetworkBuilder {
              Activations& activations, std::string& policy_head,
              std::string& value_head);
 
-  void forwardEval(float* inputs, int batchSize,
+  void forwardEval(float* values, uint64_t* masks, int batchSize,
                    std::vector<float*> output_mems);
 
  private:
diff --git a/src/neural/metal/mps/MetalNetworkBuilder.mm b/src/neural/backends/metal/mps/MetalNetworkBuilder.mm
similarity index 96%
rename from src/neural/metal/mps/MetalNetworkBuilder.mm
rename to src/neural/backends/metal/mps/MetalNetworkBuilder.mm
index 4c3d5c9ccc..7791d13d85 100644
--- a/src/neural/metal/mps/MetalNetworkBuilder.mm
+++ b/src/neural/backends/metal/mps/MetalNetworkBuilder.mm
@@ -26,7 +26,7 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
  */
 
 #import "neural/network_legacy.h"
-#import "neural/shared/attention_policy_map.h"
+#import "neural/tables/attention_policy_map.h"
 #import "MetalNetworkBuilder.h"
 #import "NetworkGraph.h"
 
@@ -36,13 +36,12 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
 MetalNetworkBuilder::MetalNetworkBuilder(void){}
 MetalNetworkBuilder::~MetalNetworkBuilder(void){}
 
-//void MetalNetworkBuilder::init(void* weights, void* options)
 std::string MetalNetworkBuilder::init(int gpu_id)
 {
     // All metal devices.
     NSArray<id<MTLDevice>> * devices = MTLCopyAllDevices();
 
-    if ([devices count] <= gpu_id) {
+    if ((NSUInteger)gpu_id >= [devices count]) {
         // No GPU device matching ID.
         [NSException raise:@"Could not find device" format:@"Could not find a GPU or CPU compute device with specified id"];
         return "";
@@ -68,13 +67,17 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
     NSString * policyHead = [NSString stringWithUTF8String:policy_head.c_str()];
     NSString * valueHead = [NSString stringWithUTF8String:value_head.c_str()];
 
-    // 0. Input placeholder.
-    // @todo - placeholder can be made directly as NHWC to avoid transposes.
+    // 0. Input value and mask placeholders.
     MPSGraphTensor * layer = [graph inputPlaceholderWithInputChannels:kInputPlanes
-                                                               height:8
-                                                                width:8
                                                                 label:@"inputs"];
 
+    MPSGraphTensor * maskTensor = [graph maskPlaceholderWithInputChannels:kInputPlanes
+                                                                    label:@"inputs/mask"];
+
+    layer = [graph expandInputTensorWithMask:maskTensor
+                                       input:layer
+                                       label:@"inputs/expand"];
+
     const NSUInteger kernelSize = 3;
     const bool isPeDenseEmbedding = embedding == InputEmbedding::INPUT_EMBEDDING_PE_DENSE;
 
@@ -302,11 +305,11 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
     }
 }
 
-void MetalNetworkBuilder::forwardEval(float * inputs, int batchSize, std::vector<float *> output_mems)
+void MetalNetworkBuilder::forwardEval(float * inputs, uint64_t * masks, int batchSize, std::vector<float *> output_mems)
 {
     @autoreleasepool {
         Lc0NetworkGraph * graph = [Lc0NetworkGraph getGraphAt:[NSNumber numberWithInt:this->gpu_id]];
-        [graph runInferenceWithBatchSize:batchSize inputs:inputs outputs:&output_mems[0]];
+        [graph runInferenceWithBatchSize:batchSize inputs:inputs masks:masks outputs:&output_mems[0]];
     }
 }
 
diff --git a/src/neural/metal/mps/NetworkGraph.h b/src/neural/backends/metal/mps/NetworkGraph.h
similarity index 92%
rename from src/neural/metal/mps/NetworkGraph.h
rename to src/neural/backends/metal/mps/NetworkGraph.h
index 2664b68c7d..dfc163cc48 100644
--- a/src/neural/metal/mps/NetworkGraph.h
+++ b/src/neural/backends/metal/mps/NetworkGraph.h
@@ -50,12 +50,13 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat
 
     // Input tensor and tensor data placeholders.
     MPSGraphTensor * _inputTensor;
+    MPSGraphTensor * _maskTensor;
 
     // Variables to track results of graph inference.
     NSArray<MPSGraphTensor *> * _resultTensors;
     NSArray<MPSGraphTensor *> * _targetTensors;
     NSMutableDictionary<NSNumber *, MPSGraphTensorDataDictionary *> * _resultDataDicts;
-    NSMutableDictionary<NSString *, NSObject *> * _readVariables;
+    NSMutableDictionary<NSString *, MPSGraphTensor *> * _readVariables;
 
     // Variables for triple buffering
     dispatch_semaphore_t _doubleBufferingSemaphore;
@@ -72,10 +73,20 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat
 -(nonnull instancetype) initWithDevice:(id<MTLDevice> __nonnull)device;
 
 -(nonnull MPSGraphTensor *) inputPlaceholderWithInputChannels:(NSUInteger)channels
-                                                       height:(NSUInteger)height
-                                                        width:(NSUInteger)width
                                                         label:(NSString * __nullable)label;
 
+-(nonnull MPSGraphTensor *) maskPlaceholderWithInputChannels:(NSUInteger)channels
+                                                       label:(NSString * __nullable)label;
+
+-(nonnull MPSGraphTensor *) expandInputTensorWithMask:(MPSGraphTensor * __nonnull)maskTensor
+                                                input:(MPSGraphTensor * __nonnull)inputTensor
+                                                label:(NSString * __nonnull)label;
+
+- (nonnull MPSGraphTensor *) broadcastByStackingTensor:(MPSGraphTensor * __nonnull)input
+                                                  axis:(NSInteger)axis
+                                                 times:(NSUInteger)times
+                                                  name:(NSString * __nonnull)name;
+
 -(nonnull MPSGraphTensor *) addConvolutionBlockWithParent:(MPSGraphTensor * __nonnull)parent
                                            outputChannels:(NSUInteger)outputChannels
                                                kernelSize:(NSUInteger)kernelSize
@@ -199,9 +210,11 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat
 
 -(nonnull NSArray<MPSGraphTensor *> *) runInferenceWithBatchSize:(NSUInteger)batchSize
                                                           inputs:(float * __nonnull)inputs
+                                                           masks:(uint64_t * __nonnull)masks
                                                          outputs:(float * __nonnull * __nonnull)outputBuffers;
 
 -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)inputs
+                                                     masks:(uint64_t * __nonnull)masks
                                                   subBatch:(NSUInteger)subBatch
                                               subBatchSize:(NSUInteger)subBatchSize;
 
diff --git a/src/neural/metal/mps/NetworkGraph.mm b/src/neural/backends/metal/mps/NetworkGraph.mm
similarity index 87%
rename from src/neural/metal/mps/NetworkGraph.mm
rename to src/neural/backends/metal/mps/NetworkGraph.mm
index 0befa256e6..322308e67b 100644
--- a/src/neural/metal/mps/NetworkGraph.mm
+++ b/src/neural/backends/metal/mps/NetworkGraph.mm
@@ -25,9 +25,11 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
   Program grant you additional permission to convey the resulting work.
 */
 
+#import <vector>
 #import "neural/network_legacy.h"
+#import "neural/tables/attention_policy_map.h"
+#import "neural/tables/policy_map.h"
 #import "NetworkGraph.h"
-#import <vector>
 
 static MPSGraphConvolution2DOpDescriptor * __nonnull convolution2DDescriptor = [MPSGraphConvolution2DOpDescriptor descriptorWithStrideInX:1
                                                                                                                                 strideInY:1
@@ -66,13 +68,12 @@ -(NSUInteger) size {
 -(NSUInteger) sizeOfDimensions:(NSArray<NSNumber *> *)dimensions {
     NSUInteger size = 1;
     for (NSNumber * dim in dimensions) {
-        if ([dim intValue] < [self.shape count])
-            size *= [self.shape[[dim intValue]] intValue];
+        if ((NSUInteger)[dim intValue] < [self.shape count])
+            size *= [self.shape[(NSUInteger)[dim intValue]] intValue];
     }
     return size;
 }
 
-
 -(NSUInteger) sizeOfDimensionsFrom:(NSNumber *)dimension {
     NSUInteger size = 1;
     for (NSUInteger dim = [dimension intValue]; dim < [self.shape count]; dim++) {
@@ -137,6 +138,7 @@ -(nonnull instancetype) initWithDevice:(id<MTLDevice> __nonnull)device
 
 -(nonnull NSArray<MPSGraphTensor *> *) runInferenceWithBatchSize:(NSUInteger)batchSize
                                                           inputs:(float * __nonnull)inputs
+                                                           masks:(uint64_t * __nonnull)masks
                                                          outputs:(float * __nonnull * __nonnull)outputBuffers
 {
     // Calculate number of sub-batches to split across GPU command buffers for parallel execution.
@@ -144,18 +146,20 @@ -(nonnull instancetype) initWithDevice:(id<MTLDevice> __nonnull)device
     NSUInteger splits = (batchSize + kMinSubBatchSize + 1) / kMinSubBatchSize;
     if (splits > kMaxInflightBuffers) splits = kMaxInflightBuffers;
     NSUInteger subBatchSize = batchSize / splits;
-    NSUInteger inputDataLength = subBatchSize * [_inputTensor sizeOfDimensions:@[@1, @2, @3]];
+    NSUInteger inputDataLength = subBatchSize * [_inputTensor sizeOfDimensionsFrom:@1];
 
     // Split batchSize into smaller sub-batches and run using double-buffering.
     NSUInteger subBatch = 0;
     MPSCommandBuffer * commandBuffer;
     for (subBatch = 0; subBatch < splits - 1; subBatch++) {
         commandBuffer = [self runCommandSubBatchWithInputs:inputs + subBatch * inputDataLength
+                                                     masks:masks + subBatch * inputDataLength
                                                   subBatch:subBatch
                                               subBatchSize:subBatchSize];
     }
     // Last sub-batch may be smaller or larger than others.
     MPSCommandBuffer * latestCommandBuffer = [self runCommandSubBatchWithInputs:inputs + subBatch * inputDataLength
+                                                                          masks:masks + subBatch * inputDataLength
                                                                        subBatch:subBatch
                                                                    subBatchSize:batchSize - subBatch * subBatchSize];
 
@@ -169,6 +173,7 @@ -(nonnull instancetype) initWithDevice:(id<MTLDevice> __nonnull)device
 }
 
 -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)inputs
+                                                     masks:(uint64_t * __nonnull)masks
                                                   subBatch:(NSUInteger)subBatch
                                               subBatchSize:(NSUInteger)subBatchSize
 {
@@ -178,7 +183,7 @@ -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)in
     // Create command buffer for this sub-batch.
     MPSCommandBuffer * commandBuffer = [MPSCommandBuffer commandBufferFromCommandQueue:_queue];
 
-    MPSShape * shape = @[@(subBatchSize), _inputTensor.shape[1], _inputTensor.shape[2], _inputTensor.shape[3]];
+    MPSShape * shape = @[@(subBatchSize), _inputTensor.shape[1], _inputTensor.shape[2]];
 
     NSData * inputData = [NSData dataWithBytesNoCopy:inputs
                                               length:subBatchSize * sizeof(float)
@@ -189,17 +194,32 @@ -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)in
                                                                                 shape:shape
                                                                              dataType:_inputTensor.dataType];
 
+    NSData * maskData = [NSData dataWithBytesNoCopy:masks
+                                             length:subBatchSize * sizeof(uint64_t)
+                                       freeWhenDone:NO];
+
+    MPSGraphTensorData * inputMaskData = [[MPSGraphTensorData alloc] initWithDevice:_device
+                                                                               data:maskData
+                                                                              shape:shape
+                                                                           dataType:MPSDataTypeUInt64];
+
+    NSDictionary * feeds = @{_inputTensor : inputTensorData, _maskTensor : inputMaskData};
+
     // Create execution descriptor with block to update results for each iteration.
     MPSGraphExecutionDescriptor * executionDescriptor = [[MPSGraphExecutionDescriptor alloc] init];
-    executionDescriptor.completionHandler = ^(MPSGraphTensorDataDictionary * resultDictionary, NSError * error) {
-        _resultDataDicts[@(subBatch)] = resultDictionary;
+    executionDescriptor.completionHandler = ^(MPSGraphTensorDataDictionary * resultDictionary, NSError * _Nullable error) {
+        if (error) {
+            NSLog(@"Error occurred during execution: %@", error);
+        } else {
+            _resultDataDicts[@(subBatch)] = resultDictionary;
+        }
 
         // Release double buffering semaphore for the next training iteration to be encoded.
         dispatch_semaphore_signal(_doubleBufferingSemaphore);
     };
 
     [self encodeToCommandBuffer:commandBuffer
-                          feeds:@{_inputTensor : inputTensorData}
+                          feeds:feeds
                   targetTensors:_targetTensors
                targetOperations:nil
             executionDescriptor:executionDescriptor];
@@ -226,9 +246,6 @@ -(void) copyResultsToBuffers:(float * __nonnull * __nonnull)outputBuffers
 
 -(void) setResultTensors:(NSArray<MPSGraphTensor *> * __nonnull)results
 {
-    // Okay to remove nulls from the read variables.
-    [_readVariables removeObjectsForKeys:[_readVariables allKeysForObject:[NSNull null]]];
-
     // Set the results we're interested in.
     _resultTensors = results;
 
@@ -238,16 +255,110 @@ -(void) setResultTensors:(NSArray<MPSGraphTensor *> * __nonnull)results
 }
 
 -(nonnull MPSGraphTensor *) inputPlaceholderWithInputChannels:(NSUInteger)channels
-                                                       height:(NSUInteger)height
-                                                        width:(NSUInteger)width
                                                         label:(NSString * __nullable)label
 {
-    // Create a placeholder tensor that can hold the specified number of sub-batches.
-    _inputTensor = [self placeholderWithShape:@[@(-1), @(channels), @(height), @(width)] name:label];
-
+    _inputTensor = [self placeholderWithShape:@[@(-1), @(channels), @1]
+                                     dataType:MPSDataTypeFloat32
+                                         name:label];
     return _inputTensor;
 }
 
+-(nonnull MPSGraphTensor *) maskPlaceholderWithInputChannels:(NSUInteger)channels
+                                                       label:(NSString * __nullable)label
+{
+    // One UInt64 bitmask per channel; the leading -1 leaves the batch dimension unspecified.
+    MPSShape * maskShape = @[@(-1), @(channels), @1];
+    _maskTensor = [self placeholderWithShape:maskShape dataType:MPSDataTypeUInt64 name:label];
+    return _maskTensor;
+}
+
+-(nonnull MPSGraphTensor *) expandInputTensorWithMask:(MPSGraphTensor * __nonnull)maskTensor
+                                                input:(MPSGraphTensor * __nonnull)valueTensor
+                                                label:(NSString * __nonnull)label
+{
+    // Expands (value, 64-bit mask) pairs on the GPU: each of the 64 outputs is the value if that mask bit is set, else 0.
+    uint64_t bitIndices[64];  // bitIndices[i] has only bit i set (filled by the loop below).
+    for (int i = 0; i < 64; i++) {
+        bitIndices[i] = 1ULL << i;
+    }
+    NSData * bitIndicesData = [NSData dataWithBytesNoCopy:bitIndices
+                                                   length:64 * sizeof(uint64_t)
+                                             freeWhenDone:NO];
+
+    MPSGraphTensor * bitIndicesTensor = [self constantWithData:bitIndicesData
+                                                         shape:@[@1, @1, @64]
+                                                      dataType:MPSDataTypeUInt64];
+
+    // Stack 64 copies of the mask on axis 3 to pair with the [1,1,64] bit constants (presumably [N,C,1] -> [N,C,1,64]; confirm).
+    maskTensor = [self broadcastByStackingTensor:maskTensor
+                                            axis:3
+                                           times:64
+                                            name:[NSString stringWithFormat:@"%@/mask/broadcast", label]];
+
+    MPSGraphTensor * expandedMaskTensor;
+    if (@available(macOS 13.0, *)) {
+        // AND every mask copy with its single-bit constant, then test != 0 to get a per-bit boolean.
+        expandedMaskTensor = [self bitwiseANDWithPrimaryTensor:maskTensor
+                                               secondaryTensor:bitIndicesTensor
+                                                          name:[NSString stringWithFormat:@"%@/mask/bitwise_and", label]];
+
+        MPSGraphTensor * zeroTensor = [self constantWithScalar:0.0
+                                                         shape:@[@1]
+                                                      dataType:MPSDataTypeUInt64];
+
+        expandedMaskTensor = [self notEqualWithPrimaryTensor:expandedMaskTensor
+                                             secondaryTensor:zeroTensor
+                                                        name:[NSString stringWithFormat:@"%@/zero_equals", label]];
+    } else {
+        // Alternative method: bitwise ops are not available in earlier macOS versions, so use integer division and modulo.
+        // Divide by the bit index, which is also a power of 2, to shift the desired bit to position 0.
+        expandedMaskTensor = [self divisionWithPrimaryTensor:maskTensor
+                                             secondaryTensor:bitIndicesTensor
+                                                        name:[NSString stringWithFormat:@"%@/mask/divide", label]];
+
+        // Take modulo 2 to extract the least significant bit
+        MPSGraphTensor * twoTensor = [self constantWithScalar:2.0
+                                                        shape:@[@1]
+                                                     dataType:MPSDataTypeUInt64];
+
+        expandedMaskTensor = [self moduloWithPrimaryTensor:expandedMaskTensor
+                                           secondaryTensor:twoTensor
+                                                      name:[NSString stringWithFormat:@"%@/mask/modulo", label]];
+    }
+
+    // Stack the values 64 times on the same axis so they line up element-wise with the expanded mask.
+    valueTensor = [self broadcastByStackingTensor:valueTensor
+                                             axis:3
+                                            times:64
+                                             name:[NSString stringWithFormat:@"%@/input/broadcast", label]];
+
+    expandedMaskTensor = [self castTensor:expandedMaskTensor
+                                   toType:MPSDataTypeFloat32
+                                     name:[NSString stringWithFormat:@"%@/input/cast", label]];
+
+    // Final multiplication: value * mask (the mask was cast to Float32 just above).
+    expandedMaskTensor = [self multiplicationWithPrimaryTensor:expandedMaskTensor
+                                               secondaryTensor:valueTensor
+                                                          name:[NSString stringWithFormat:@"%@/input/multiply", label]];
+
+    // Reshape to [batch, channels, 8, 8]; channels is read from dimension 1 of the stacked value tensor.
+    return [self reshapeTensor:expandedMaskTensor
+                     withShape:@[@(-1), valueTensor.shape[1], @8, @8]
+                          name:[NSString stringWithFormat:@"%@/input/reshape", label]];
+}
+
+- (nonnull MPSGraphTensor *) broadcastByStackingTensor:(MPSGraphTensor * __nonnull)input
+                                                  axis:(NSInteger)axis
+                                                 times:(NSUInteger)times
+                                                  name:(NSString * __nonnull)name
+{
+    // Stack `times` references to the same tensor along `axis`.
+    NSMutableArray<MPSGraphTensor *> * copies = [NSMutableArray arrayWithCapacity:times];
+    for (NSUInteger left = times; left > 0; left--) [copies addObject:input];
+
+    return [self stackTensors:copies axis:axis name:name];
+}
+
 -(nonnull MPSGraphTensor *) addConvolutionBlockWithParent:(MPSGraphTensor * __nonnull)parent
                                            outputChannels:(NSUInteger)outputChannels
                                                kernelSize:(NSUInteger)kernelSize
@@ -471,23 +582,37 @@ -(nonnull MPSGraphTensor *) addSEUnitWithParent:(MPSGraphTensor * __nonnull)pare
 }
 
 -(nonnull MPSGraphTensor *) addPolicyMapLayerWithParent:(MPSGraphTensor * __nonnull)parent
-                                              policyMap:(uint32_t * __nonnull)policyMap
+                                              policyMap:(const short * __nonnull)policyMap
+                                                mapSize:(NSUInteger)mapSize
                                                   label:(NSString * __nonnull)label
 {
-    NSData * policyMapData = [NSData dataWithBytesNoCopy:policyMap
-                                                  length:kNumPolicyOutputs * sizeof(uint32_t)
-                                            freeWhenDone:NO];
+    if ([parent sizeOfDimensionsFrom:@1] < mapSize) {
+        [NSException raise:@"Invalid parent tensor shape"
+                    format:@"Parent tensor non-batch dimensions (%zu) is less than mapping tensor size of (%zu) for policy mapping.",
+                           [parent sizeOfDimensionsFrom:@1], mapSize];
+    }
 
-    MPSGraphTensor * mappingTensor = [self constantWithData:policyMapData
+    // policyMap has mapSize entries, one per flattened parent position; each is -1 (unused) or a policy index 0..1857.
+    // Invert it into a 1D table of size 1858: mappingIndices[p] is the position i whose policyMap[i] == p,
+    // i.e. only positions that had a value != -1 appear, and the table can serve as gather indices.
+    uint32_t mappingIndices[kNumPolicyOutputs];
+    for (NSUInteger i = 0; i < mapSize; i++) {
+        if (policyMap[i] == -1) continue;
+        mappingIndices[policyMap[i]] = i;
+    }
+
+    NSData * policyMapIndexData = [NSData dataWithBytesNoCopy:mappingIndices
+                                                       length:kNumPolicyOutputs * sizeof(uint32_t)
+                                                 freeWhenDone:NO];
+
+    MPSGraphTensor * indicesTensor = [self constantWithData:policyMapIndexData
                                                       shape:@[@(kNumPolicyOutputs)]
                                                    dataType:MPSDataTypeUInt32];
 
-    MPSGraphTensor * flatConvTensor = [self flatten2DTensor:parent
-                                                       axis:1
-                                                       name:[NSString stringWithFormat:@"%@/flatten", label]];
+    parent = [self flatten2DTensor:parent axis:1 name:[NSString stringWithFormat:@"%@/flatten", label]];
 
-    MPSGraphTensor * policyTensor = [self gatherWithUpdatesTensor:flatConvTensor
-                                                    indicesTensor:mappingTensor
+    MPSGraphTensor * policyTensor = [self gatherWithUpdatesTensor:parent
+                                                    indicesTensor:indicesTensor
                                                              axis:1
                                                   batchDimensions:0
                                                              name:[NSString stringWithFormat:@"%@/gather", label]];
@@ -506,7 +631,6 @@ -(nonnull MPSGraphTensor *) addEncoderLayerWithParent:(MPSGraphTensor * __nonnul
                                              normtype:(NSString * __nonnull)normtype
                                                 label:(NSString * __nonnull)label
 {
-    NSUInteger dModel = encoder.mha.q_b.size();
     MPSGraphTensor * mhaQ = [self addFullyConnectedLayerWithParent:parent
                                                     outputChannels:encoder.mha.q_b.size()
                                                            weights:&encoder.mha.q_w[0]
@@ -605,15 +729,16 @@ -(nonnull MPSGraphTensor *) addEncoderLayerWithParent:(MPSGraphTensor * __nonnul
                                                label:[NSString stringWithFormat:@"%@/ln2", label]];
     }
     else if ([normtype isEqual:@"rmsnorm"] || [normtype isEqual:@"skipfirst"]) {
-        enc = [self addRmsNormalizationWithParent:enc
-                            scaledSecondaryTensor:ffn
-                                           gammas:&encoder.ln2_gammas[0]
-                                            alpha:alpha
-                                            label:[NSString stringWithFormat:@"%@/ln1", label]];
+        return [self addRmsNormalizationWithParent:enc
+                             scaledSecondaryTensor:ffn
+                                            gammas:&encoder.ln2_gammas[0]
+                                             alpha:alpha
+                                             label:[NSString stringWithFormat:@"%@/ln1", label]];
     }
     else {
         [NSException raise:@"Invalid normalization type."
                     format:@"Invalid normalization type specified: %@", normtype];
+        return nil;
     }
 }
 
@@ -882,7 +1007,8 @@ -(nonnull MPSGraphTensor *) scaledQKMatmulWithQueries:(MPSGraphTensor * __nonnul
 
     qkMatmul = [self multiplicationWithPrimaryTensor:qkMatmul
                                      secondaryTensor:[self constantWithScalar:scale
-                                                                        shape:@[@1] dataType:qkMatmul.dataType]
+                                                                        shape:@[@1]
+                                                                     dataType:qkMatmul.dataType]
                                                 name:[NSString stringWithFormat:@"%@/scale", label]];
     return qkMatmul;
 }
@@ -944,6 +1070,14 @@ -(nonnull MPSGraphTensor *) attentionPolicyPromoMatmulConcatWithParent:(MPSGraph
 
     parent = [self reshapeTensor:parent withShape:@[@(-1), @64, @64] name:[NSString stringWithFormat:@"%@/parent_reshape", label]];
 
+    MPSGraphTensor * slice = [self sliceTensor:parent dimension:1 start:48 length:8 name:[NSString stringWithFormat:@"%@/slice_policy_1", label]];
+    slice = [self sliceTensor:slice dimension:2 start:56 length:8 name:[NSString stringWithFormat:@"%@/slice_policy_2", label]];
+    slice = [self reshapeTensor:slice withShape:@[@(-1), @64] name:[NSString stringWithFormat:@"%@/slice_reshape", label]];
+    slice = [self broadcastByStackingTensor:slice axis:2 times:3 name:[NSString stringWithFormat:@"%@/slice_broadcast", label]];
+    slice = [self transposeTensor:slice dimension:1 withDimension:2 name:[NSString stringWithFormat:@"%@/slice_transpose", label]];
+
+    promo = [self additionWithPrimaryTensor:promo secondaryTensor:slice name:[NSString stringWithFormat:@"%@/offset_add", label]];
+
     return [self concatTensor:parent withTensor:promo dimension:1 name:[NSString stringWithFormat:@"%@/concat", label]];
 }
 
@@ -1263,7 +1397,8 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull
                                            scale:1.0f / sqrt(policyDModel)
                                            label:[NSString stringWithFormat:@"%@/self_attention/kq", label]];
 
-        // 6. Slice last 8 keys (k[:, 56:, :]) and matmul with policy promotion weights, then concat to matmul_qk.
+        // 6. Slice last 8 keys (k[:, 48:56, 56:64]) and matmul with policy promotion weights,
+        //    add to promotion logits then concat to matmul_qk.
         policy = [self attentionPolicyPromoMatmulConcatWithParent:policy
                                                          withKeys:keys
                                                           weights:&head.ip4_pol_w[0]
@@ -1272,6 +1407,12 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull
                                                         sliceFrom:56
                                                       channelSize:policyDModel
                                                             label:[NSString stringWithFormat:@"%@/promo_logits", label]];
+
+        policy = [self addPolicyMapLayerWithParent:policy
+                                         policyMap:&lczero::kAttnPolicyMap[0]
+                                           mapSize:(64 * 64 + 8 * 24)
+                                             label:[NSString stringWithFormat:@"%@/policy_mapping", label]];
+
     }
     else if (convolutionPolicy) {
         if (attentionBody) {
@@ -1296,30 +1437,10 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull
                                                label:[NSString stringWithFormat:@"%@/conv2", label]];
 
 
-        /**
-         * @todo policy map implementation has bug in MPSGraph (GatherND not working in graph).
-         * Implementation of policy map to be done in CPU for now.
-         *
-         * Reinstate this section when bug is fixed. See comments below.
-         *
-         // [1858 -> HWC or CHW]
-         const bool HWC = false;
-         std::vector<uint32_t> policy_map(1858);
-         for (const auto& mapping : kConvPolicyMap) {
-         if (mapping == -1) continue;
-         const auto index = &mapping - kConvPolicyMap;
-         const auto displacement = index / 64;
-         const auto square = index % 64;
-         const auto row = square / 8;
-         const auto col = square % 8;
-         if (HWC) {
-         policy_map[mapping] = ((row * 8) + col) * 80 + displacement;
-         } else {
-         policy_map[mapping] = ((displacement * 8) + row) * 8 + col;
-         }
-         }
-         policy = builder_->makePolicyMapLayer(policy, &policy_map[0], "policy_map");
-         */
+        policy = [self addPolicyMapLayerWithParent:policy
+                                         policyMap:&lczero::kConvPolicyMap[0]
+                                           mapSize:(73 * 64)
+                                             label:[NSString stringWithFormat:@"%@/policy_mapping", label]];
     }
     else {
         if (attentionBody) {
@@ -1391,10 +1512,10 @@ -(nonnull MPSGraphTensor *) makeValueHeadWithTensor:(MPSGraphTensor * __nonnull)
 
     value = [self addFullyConnectedLayerWithParent:value
                                     outputChannels:head.ip2_val_b.size()
-                                            weights:&head.ip2_val_w[0]
+                                           weights:&head.ip2_val_w[0]
                                             biases:&head.ip2_val_b[0]
                                         activation:wdl ? @"softmax" : @"tanh"
-                                                label:[NSString stringWithFormat:@"%@/fc2", label]];
+                                             label:[NSString stringWithFormat:@"%@/fc2", label]];
 
     return value;
 }
diff --git a/src/neural/metal/network_metal.cc b/src/neural/backends/metal/network_metal.cc
similarity index 71%
rename from src/neural/metal/network_metal.cc
rename to src/neural/backends/metal/network_metal.cc
index f3d8d7eda2..46f29459b5 100644
--- a/src/neural/metal/network_metal.cc
+++ b/src/neural/backends/metal/network_metal.cc
@@ -37,8 +37,8 @@
 #include "mps/MetalNetworkBuilder.h"
 #include "neural/factory.h"
 #include "neural/network_legacy.h"
-#include "neural/shared/attention_policy_map.h"
-#include "neural/shared/policy_map.h"
+#include "neural/tables/attention_policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "utils/bititer.h"
 #include "utils/exception.h"
 
@@ -160,99 +160,30 @@ MetalNetwork::MetalNetwork(const WeightsFile& file, const OptionsDict& options)
                     "' does not exist in this net.");
   }
 
-  auto embedding = static_cast<InputEmbedding>(file.format().network_format().input_embedding());
-  builder_->build(kInputPlanes, weights, embedding, attn_body, attn_policy_, conv_policy_,
-                  wdl_, moves_left_, activations, policy_head, value_head);
+  auto embedding = static_cast<InputEmbedding>(
+      file.format().network_format().input_embedding());
+  builder_->build(kInputPlanes, weights, embedding, attn_body, attn_policy_,
+                  conv_policy_, wdl_, moves_left_, activations, policy_head,
+                  value_head);
 }
 
 void MetalNetwork::forwardEval(InputsOutputs* io, int batchSize) {
-  // Expand encoded input into N x 112 x 8 x 8.
-  float* dptr = &io->input_val_mem_expanded_[0];
-  for (size_t i = 0; i < batchSize; i++) {
-    for (size_t j = 0; j < kInputPlanes; j++) {
-      const float value = io->input_val_mem_[j + i * kInputPlanes];
-      const uint64_t mask = io->input_masks_mem_[j + i * kInputPlanes];
-      for (auto k = 0; k < 64; k++) {
-        *(dptr++) = (mask & (((uint64_t)1) << k)) != 0 ? value : 0;
-      }
-    }
-  }
-
   // Metal is not thread-safe, so lock is needed.
   lock_.lock();
 
-  if (attn_policy_ || conv_policy_) {
-    /**
-     * @todo policy map implementation has bug in MPSGraph (GatherND not working
-     * in graph). Implementation of policy map to be done in CPU for now.
-     *
-     * Remove this if-branch when bug is fixed. See comments above.
-     */
-
-    if (moves_left_) {
-      builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize,
-                            {&io->op_policy_raw_mem_[0], &io->op_value_mem_[0],
-                             &io->op_moves_left_mem_[0]});
-    } else {
-      builder_->forwardEval(
-          &io->input_val_mem_expanded_[0], batchSize,
-          {&io->op_policy_raw_mem_[0], &io->op_value_mem_[0]});
-    }
-    // The next thread can start using the GPU now.
-    lock_.unlock();
-
-    if (attn_policy_) {
-      // Promotion offset calculation.
-      for (size_t batch = 0; batch < batchSize; batch++) {
-        for (int k = 0; k < 8; k++) {      // y in cuda
-          for (int j = 0; j < 8; j++) {    // w in cuda
-            for (int i = 0; i < 3; i++) {  // c in cuda
-              // Promotion offsets already precalculated and stored in GPU.
-              // Just the main policy offsets need to be added here.
-              io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + 64 * 64 +
-                                     24 * k + 3 * j + i] +=
-                  io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) +
-                                         (48 + k) * 64 + 56 + j];
-            }
-          }
-        }
-      }
-      // Mapping from attention policy to lc0 policy
-      for (size_t batch = 0; batch < batchSize; batch++) {
-        for (size_t i = 0; i < 64 * 64 + 8 * 24; i++) {
-          size_t j = kAttnPolicyMap[i];
-          if (j >= 0) {
-            io->op_policy_mem_[batch * 1858 + j] =
-                io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + i];
-          }
-        }
-      }
-    } else if (conv_policy_) {
-      // Mapping from convolutional policy to lc0 policy
-      for (size_t batch = 0; batch < batchSize; batch++) {
-        for (size_t i = 0; i < 73 * 64; i++) {
-          short j = kConvPolicyMap[i];
-          if (j >= 0) {
-            io->op_policy_mem_[batch * 1858 + j] =
-                io->op_policy_raw_mem_[batch * 80 * 64 + i];
-          }
-        }
-      }
-    }
-
+  if (moves_left_) {
+    builder_->forwardEval(&io->input_val_mem_[0], &io->input_masks_mem_[0],
+                          batchSize,
+                          {&io->op_policy_mem_[0], &io->op_value_mem_[0],
+                           &io->op_moves_left_mem_[0]});
   } else {
-    if (moves_left_) {
-      builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize,
-                            {&io->op_policy_mem_[0], &io->op_value_mem_[0],
-                             &io->op_moves_left_mem_[0]});
-    } else {
-      builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize,
-                            {&io->op_policy_mem_[0], &io->op_value_mem_[0]});
-    }
-
-    // The next thread can start using the GPU now.
-    lock_.unlock();
+    builder_->forwardEval(&io->input_val_mem_[0], &io->input_masks_mem_[0],
+                          batchSize,
+                          {&io->op_policy_mem_[0], &io->op_value_mem_[0]});
   }
+
+  // The next thread can start using the GPU now.
+  lock_.unlock();
 }
 
 std::unique_ptr<Network> MakeMetalNetwork(const std::optional<WeightsFile>& w,
diff --git a/src/neural/metal/network_metal.h b/src/neural/backends/metal/network_metal.h
similarity index 100%
rename from src/neural/metal/network_metal.h
rename to src/neural/backends/metal/network_metal.h
diff --git a/src/neural/network_check.cc b/src/neural/backends/network_check.cc
similarity index 99%
rename from src/neural/network_check.cc
rename to src/neural/backends/network_check.cc
index 1b67266cff..c779acc4ab 100644
--- a/src/neural/network_check.cc
+++ b/src/neural/backends/network_check.cc
@@ -30,6 +30,7 @@
 #include <iomanip>
 
 #include "neural/decoder.h"
+#include "neural/encoder.h"
 #include "neural/factory.h"
 #include "neural/network.h"
 #include "utils/histogram.h"
@@ -123,7 +124,7 @@ class CheckComputation : public NetworkComputation {
     std::vector<float> policy;
     policy.reserve(moves.size());
     for (const auto move : moves) {
-      policy.emplace_back(comp->GetPVal(sample, move.as_nn_index(0)));
+      policy.emplace_back(comp->GetPVal(sample, MoveToNNIndex(move, 0)));
       max_p = std::max(max_p, policy.back());
     }
     float total = 0;
diff --git a/src/neural/backends/network_demux.cc b/src/neural/backends/network_demux.cc
new file mode 100644
index 0000000000..accf9bd12f
--- /dev/null
+++ b/src/neural/backends/network_demux.cc
@@ -0,0 +1,344 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2020 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdlib>
+#include <mutex>
+#include <queue>
+#include <thread>
+
+#include "neural/factory.h"
+
+namespace lczero {
+namespace {
+
+class DemuxingComputation;
+
+struct DemuxingWork {
+  DemuxingComputation* source_ = nullptr;  // Computation owning the inputs.
+  std::unique_ptr<NetworkComputation> computation_;  // Set by the worker.
+  int start_ = 0;  // First sample index of the slice (inclusive).
+  int end_ = 0;    // End sample index of the slice (exclusive).
+  // Search key used by GetParent(): only end_ takes part in the ordering.
+  DemuxingWork(int sample) : end_(sample) {}
+  DemuxingWork(DemuxingComputation* source, int start, int end)
+      : source_(source), start_(start), end_(end) {
+    assert(start_ != end_);
+  }
+  // Slices are ordered by end_ alone; std::lower_bound relies on this.
+  auto operator<=>(const DemuxingWork& b) const { return end_ <=> b.end_; }
+};
+
+class DemuxingNetwork;
+class DemuxingBackend;
+class DemuxingComputation final : public NetworkComputation {
+  std::tuple<const std::unique_ptr<NetworkComputation>&, int> GetParent(
+      int sample) const {
+    auto iter = std::lower_bound(parents_.begin(), parents_.end(), sample + 1);
+    assert(iter != parents_.end());
+    assert(sample >= iter->start_);
+    assert(sample < iter->end_);
+    return {iter->computation_, sample - iter->start_};  // Offset in slice.
+  }
+
+ public:
+  DemuxingComputation(DemuxingNetwork* network) : network_(network) {}
+  ~DemuxingComputation() {
+    // Spin until NotifyComplete() has stored -1, which it does only after its
+    // notify_one() call, so dataready_cv_ is not destroyed while still in use.
+    while (dataready_.load(std::memory_order_acquire) != -1) {
+      SpinloopPause();
+    }
+  }
+
+  void AddInput(InputPlanes&& input) override {
+    planes_.emplace_back(std::move(input));
+  }
+
+  void ComputeBlocking() override;
+
+  int GetBatchSize() const override { return planes_.size(); }
+
+  float GetQVal(int sample) const override {
+    auto [parent, offset] = GetParent(sample);
+    if (!parent) return 0;  // Slice has no computation attached.
+    return parent->GetQVal(offset);
+  }
+
+  float GetDVal(int sample) const override {
+    auto [parent, offset] = GetParent(sample);
+    if (!parent) return 0;  // Slice has no computation attached.
+    return parent->GetDVal(offset);
+  }
+
+  float GetMVal(int sample) const override {
+    auto [parent, offset] = GetParent(sample);
+    if (!parent) return 0;  // Slice has no computation attached.
+    return parent->GetMVal(offset);
+  }
+
+  float GetPVal(int sample, int move_id) const override {
+    auto [parent, offset] = GetParent(sample);
+    if (!parent) return 0;  // Slice has no computation attached.
+    return parent->GetPVal(offset, move_id);
+  }
+
+  void NotifyComplete() {
+    if (1 == dataready_.fetch_sub(1, std::memory_order_release)) {
+      {
+        std::lock_guard lock(mutex_);  // Sync with waiter's predicate check.
+      }
+      dataready_cv_.notify_one();  // Wakes ComputeBlocking().
+      dataready_.store(-1, std::memory_order_release);  // Lets dtor finish.
+    }
+  }
+
+ private:
+  std::vector<InputPlanes> planes_;  // Inputs; moved from by the workers.
+  DemuxingNetwork* network_;
+  std::vector<DemuxingWork> parents_;  // Slices in increasing end_ order.
+
+  std::mutex mutex_;
+  std::condition_variable dataready_cv_;
+  std::atomic<int> dataready_ = -1;  // Outstanding slices; -1 when idle.
+
+  friend class DemuxingBackend;
+};
+
+class DemuxingBackend {
+ public:
+  ~DemuxingBackend() {
+    while (!threads_.empty()) {
+      threads_.back().join();
+      threads_.pop_back();
+    }
+    while (!queue_.empty()) {  // Release waiters of work that never ran.
+      queue_.front()->source_->NotifyComplete();
+      queue_.pop();
+    }
+  }
+
+  void Assign(std::unique_ptr<Network>&& network, const OptionsDict& opts,
+              std::atomic<bool>& abort) {
+    network_ = std::move(network);
+    int nn_threads = opts.GetOrDefault<int>("threads", 0);
+    if (nn_threads == 0) {  // 0 (the default): ask the wrapped network.
+      nn_threads = network_->GetThreads();
+    }
+    for (int i = 0; i < nn_threads; i++) {
+      threads_.emplace_back([&] { Worker(abort); });
+    }
+  }
+
+  void Enqueue(DemuxingWork* work) {
+    {
+      std::unique_lock lock(mutex_);
+      queue_.push(work);  // Pointer only; the caller keeps ownership.
+    }
+    dataready_cv_.notify_one();
+  }
+
+  void Abort() {
+    {
+      std::unique_lock lock(mutex_);  // Sync with waiters' predicate check.
+    }
+    dataready_cv_.notify_all();
+  }
+
+  void Worker(std::atomic<bool>& abort) {
+    while (!abort.load(std::memory_order_relaxed)) {
+      DemuxingWork* work = nullptr;
+      {
+        std::unique_lock lock(mutex_);
+        dataready_cv_.wait(lock, [&] {
+          return abort.load(std::memory_order_relaxed) || !queue_.empty();
+        });
+        if (abort.load(std::memory_order_relaxed)) return;
+        if (!queue_.empty()) {
+          work = queue_.front();
+          queue_.pop();
+        }
+      }
+      if (work) {  // Evaluate outside the lock.
+        work->computation_ = network_->NewComputation();
+        auto& planes = work->source_->planes_;
+        for (int i = work->start_; i < work->end_; i++) {
+          work->computation_->AddInput(std::move(planes[i]));
+        }
+        work->computation_->ComputeBlocking();
+        work->source_->NotifyComplete();
+      }
+    }
+  }
+
+ private:
+  std::mutex mutex_;  // Guards queue_.
+  std::condition_variable dataready_cv_;  // Signals new work or abort.
+  std::vector<std::thread> threads_;  // Threads running Worker().
+  std::unique_ptr<Network> network_;  // Wrapped backend network.
+  std::queue<DemuxingWork*> queue_;  // Pending slices (not owned).
+};
+
+class DemuxingNetwork final : public Network {
+ public:
+  DemuxingNetwork(const std::optional<WeightsFile>& weights,
+                  const OptionsDict& options)
+      : backends_(std::max(size_t(1), options.ListSubdicts().size())) {
+    const auto parents = options.ListSubdicts();
+    if (parents.empty()) {
+      // If options are empty, or multiplexer configured in root object,
+      // initialize on root object and default backend.
+      auto backends = NetworkFactory::Get()->GetBackendsList();
+      AddBackend(0, backends[0], weights, options);
+    }
+
+    int i = 0;
+    for (const auto& name : parents) {
+      AddBackend(i++, name, weights, options.GetSubdict(name));
+    }
+  }
+
+  void AddBackend(int index, const std::string& name,
+                  const std::optional<WeightsFile>& weights,
+                  const OptionsDict& opts) {
+    const std::string backend = opts.GetOrDefault<std::string>("backend", name);
+
+    auto network = NetworkFactory::Get()->Create(backend, weights, opts);
+
+    min_batch_size_ = std::min(min_batch_size_, network->GetMiniBatchSize());
+    batch_step_ = std::max(batch_step_, network->GetPreferredBatchStep());
+    is_cpu_ &= network->IsCpu();  // CPU only if every backend is.
+    if (index == 0) {
+      capabilities_ = network->GetCapabilities();
+    } else {
+      capabilities_.Merge(network->GetCapabilities());
+    }
+    backends_[index].Assign(std::move(network), opts, abort_);
+  }
+
+  std::unique_ptr<NetworkComputation> NewComputation() override {
+    return std::make_unique<DemuxingComputation>(this);
+  }
+
+  const NetworkCapabilities& GetCapabilities() const override {
+    return capabilities_;
+  }
+
+  int GetMiniBatchSize() const override {
+    return min_batch_size_ * backends_.size();
+  }
+
+  int GetPreferredBatchStep() const override { return batch_step_; }
+
+  bool IsCpu() const override { return is_cpu_; }
+
+  ~DemuxingNetwork() { Abort(); }  // Workers are joined by ~DemuxingBackend.
+
+  void Abort() {
+    abort_.store(true, std::memory_order_relaxed);
+    for (auto& b : backends_) {
+      b.Abort();
+    }
+  }
+
+  std::atomic<bool> abort_ = false;  // First member: outlives backends_.
+  std::vector<DemuxingBackend> backends_;
+  NetworkCapabilities capabilities_;
+  int min_batch_size_ = std::numeric_limits<int>::max();
+  int batch_step_ = 1;  // Largest preferred batch step of any backend.
+  bool is_cpu_ = true;
+  std::atomic<int64_t> start_index_ = 0;  // Rotates the first backend used.
+};
+
+void DemuxingComputation::ComputeBlocking() {
+  if (GetBatchSize() == 0) return;
+  // Number of batch_step_-sized splits needed to cover the batch (rounds up).
+  int splits = 1 + (GetBatchSize() - 1) / network_->batch_step_;
+  // Calculate the minimum number of splits per backend.
+  int split_size_per_backend = splits / network_->backends_.size();
+  // Calculate how many backends get extra work.
+  int extra_split_backends =
+      splits - split_size_per_backend * network_->backends_.size();
+
+  // Find the first backend which got less work from the previous batch.
+  int start_index =
+      network_->start_index_.fetch_add(std::max(1, extra_split_backends),
+                                       std::memory_order_relaxed) %
+      network_->backends_.size();
+
+  int end_index =
+      (start_index + extra_split_backends) % network_->backends_.size();
+  int work_start = 0;
+  int work_items = split_size_per_backend > 0 ? network_->backends_.size()
+                                             : extra_split_backends;
+  // Store the work item count; reserve so enqueued pointers stay valid.
+  dataready_.store(work_items, std::memory_order_relaxed);
+  parents_.reserve(work_items);
+  int i = start_index;
+  // First send work to backends which get extra work.
+  int split_size = split_size_per_backend + 1;
+  for (; i != end_index; i = (i + 1) % network_->backends_.size()) {
+    assert(work_start != GetBatchSize());
+    int work_end = work_start + split_size * network_->batch_step_;
+    work_end = std::min(work_end, GetBatchSize());  // Last split may be short.
+    parents_.emplace_back(this, work_start, work_end);
+    network_->backends_[i].Enqueue(&parents_.back());
+    work_start = work_end;
+  }
+  // Queue remaining work items which don't get extra work.
+  split_size--;
+  if (split_size > 0) {
+    do {
+      assert(work_start != GetBatchSize());
+      int work_end = work_start + split_size * network_->batch_step_;
+      work_end = std::min(work_end, GetBatchSize());
+      parents_.emplace_back(this, work_start, work_end);
+      network_->backends_[i].Enqueue(&parents_.back());
+      work_start = work_end;
+      i = (i + 1) % network_->backends_.size();
+    } while (i != start_index);
+  }
+  assert(work_start == GetBatchSize());
+  assert(work_items == (int)parents_.size());
+  // Wait until all backends complete their work.
+  std::unique_lock<std::mutex> lock(mutex_);
+  dataready_cv_.wait(lock, [this]() {
+    return dataready_.load(std::memory_order_acquire) <= 0;
+  });
+}
+
+std::unique_ptr<Network> MakeDemuxingNetwork(
+    const std::optional<WeightsFile>& weights, const OptionsDict& options) {
+  return std::make_unique<DemuxingNetwork>(weights, options);
+}
+
+REGISTER_NETWORK("demux", MakeDemuxingNetwork, -1001)
+
+}  // namespace
+}  // namespace lczero
diff --git a/src/neural/network_mux.cc b/src/neural/backends/network_mux.cc
similarity index 100%
rename from src/neural/network_mux.cc
rename to src/neural/backends/network_mux.cc
diff --git a/src/neural/network_random.cc b/src/neural/backends/network_random.cc
similarity index 100%
rename from src/neural/network_random.cc
rename to src/neural/backends/network_random.cc
diff --git a/src/neural/network_record.cc b/src/neural/backends/network_record.cc
similarity index 100%
rename from src/neural/network_record.cc
rename to src/neural/backends/network_record.cc
diff --git a/src/neural/network_rr.cc b/src/neural/backends/network_rr.cc
similarity index 100%
rename from src/neural/network_rr.cc
rename to src/neural/backends/network_rr.cc
diff --git a/src/neural/network_tf_cc.cc b/src/neural/backends/network_tf_cc.cc
similarity index 99%
rename from src/neural/network_tf_cc.cc
rename to src/neural/backends/network_tf_cc.cc
index f7b5cad65f..68d1c0fb9d 100644
--- a/src/neural/network_tf_cc.cc
+++ b/src/neural/backends/network_tf_cc.cc
@@ -38,7 +38,7 @@
 
 #include "neural/factory.h"
 #include "neural/network_legacy.h"
-#include "neural/shared/policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "utils/bititer.h"
 #include "utils/optionsdict.h"
 #include "utils/transpose.h"
diff --git a/src/neural/network_trivial.cc b/src/neural/backends/network_trivial.cc
similarity index 99%
rename from src/neural/network_trivial.cc
rename to src/neural/backends/network_trivial.cc
index 46f48d1f99..a433d63e5e 100644
--- a/src/neural/network_trivial.cc
+++ b/src/neural/backends/network_trivial.cc
@@ -403,7 +403,8 @@ float DotProduct(uint64_t plane, const std::array<float, 64>& weights) {
 }
 
 int NumBits(uint64_t x) {
-  return std::distance(BitIterator<int>(x), BitIterator<int>(0));
+  using Iterator = BitIterator<int>;
+  return std::distance(Iterator(x), Iterator(0));
 }
 
 class TrivialNetworkComputation : public NetworkComputation {
diff --git a/src/neural/onednn/layers.cc b/src/neural/backends/onednn/layers.cc
similarity index 100%
rename from src/neural/onednn/layers.cc
rename to src/neural/backends/onednn/layers.cc
diff --git a/src/neural/onednn/layers.h b/src/neural/backends/onednn/layers.h
similarity index 99%
rename from src/neural/onednn/layers.h
rename to src/neural/backends/onednn/layers.h
index 64b0096fc1..0192096a06 100644
--- a/src/neural/onednn/layers.h
+++ b/src/neural/backends/onednn/layers.h
@@ -26,7 +26,7 @@
 */
 #pragma once
 
-#include "neural/shared/activation.h"
+#include "neural/tables/activation_function.h"
 #include "utils/exception.h"
 
 #include "dnnl.hpp"
diff --git a/src/neural/onednn/network_onednn.cc b/src/neural/backends/onednn/network_onednn.cc
similarity index 99%
rename from src/neural/onednn/network_onednn.cc
rename to src/neural/backends/onednn/network_onednn.cc
index 3f03cbd054..7640cc20e5 100644
--- a/src/neural/onednn/network_onednn.cc
+++ b/src/neural/backends/onednn/network_onednn.cc
@@ -35,8 +35,8 @@
 #include "layers.h"
 #include "neural/factory.h"
 #include "neural/network_legacy.h"
-#include "neural/shared/attention_policy_map.h"
-#include "neural/shared/policy_map.h"
+#include "neural/tables/attention_policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "utils/bititer.h"
 #include "utils/exception.h"
 
@@ -174,13 +174,13 @@ class OnednnNetwork : public Network {
         options.GetOrDefault<int>("jit_cache", 1024));
 #endif
 
-    if (!options.IsDefault<int>("threads")) {
+    if (options.Exists<int>("threads")) {
       omp_set_num_threads(options.Get<int>("threads"));
     }
 
     cpu_eng_ = dnnl::engine(dnnl::engine::kind::cpu, 0);
 
-    if (!options.IsDefault<int>("gpu")) {
+    if (options.Exists<int>("gpu")) {
       eng_ = dnnl::engine(dnnl::engine::kind::gpu, options.Get<int>("gpu"));
     } else {
       eng_ = cpu_eng_;
@@ -201,7 +201,7 @@ class OnednnNetwork : public Network {
     // on gpu and not on cpu (last tested with version 2.6.0). So for the time
     // being this will be overriden in every case.
     auto convolution_type = dnnl::algorithm::convolution_auto;
-    if (!options.IsDefault<bool>("winograd")) {
+    if (options.Exists<bool>("winograd")) {
       if (options.Get<bool>("winograd")) {
         convolution_type = dnnl::algorithm::convolution_winograd;
       } else {
diff --git a/src/neural/backends/onnx/network_onnx.cc b/src/neural/backends/onnx/network_onnx.cc
new file mode 100644
index 0000000000..f5ac887cda
--- /dev/null
+++ b/src/neural/backends/onnx/network_onnx.cc
@@ -0,0 +1,966 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2021-2023 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include <algorithm>
+#include <cassert>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <iterator>
+#include <list>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "onnx_conf.h"
+
+#ifdef USE_ONNX_CUDART
+#include "cuda_runtime.h"
+#include "neural/backends/onnx/onnx_kernels.h"
+#endif
+
+#include "neural/factory.h"
+#include "neural/loader.h"
+#include "neural/network.h"
+#include "neural/onnx/converter.h"
+#include "onnxruntime_cxx_api.h"
+#include "utils/bf16_utils.h"
+#include "utils/bititer.h"
+#include "utils/commandline.h"
+#include "utils/exception.h"
+#include "utils/fp16_utils.h"
+#include "utils/logging.h"
+#include "utils/trace.h"
+
+namespace lczero {
+namespace onnx {
+
+enum class OnnxProvider { CPU, CUDA, DML, ROCM, TRT, MIGRAPHX };
+
+class OnnxNetwork;
+
+static constexpr int kNumOutputPolicy = 1858;
+
+struct InputsOutputs {
+  InputsOutputs(OnnxNetwork* network);
+  ~InputsOutputs() {
+    switch (provider_) {
+      case OnnxProvider::CUDA:
+      case OnnxProvider::TRT:
+#ifdef USE_ONNX_CUDART
+        ReportCUDAErrors(cudaEventDestroy(inputs_uploaded_event_));
+        ReportCUDAErrors(cudaEventDestroy(inputs_processed_event_));
+        ReportCUDAErrors(cudaEventDestroy(evaluation_done_event_));
+        ReportCUDAErrors(cudaEventDestroy(outputs_download_event_));
+        ReportCUDAErrors(cudaFree(input_tensor_upload_device_));
+        ReportCUDAErrors(cudaFree(input_tensor_data_device_));
+        for (void* ptr : output_tensors_data_device_) {
+          ReportCUDAErrors(cudaFree(ptr));
+        }
+        ReportCUDAErrors(cudaFreeHost(input_tensor_data_));
+        for (void* ptr : output_tensors_data_) {
+          ReportCUDAErrors(cudaFreeHost(ptr));
+        }
+        break;
+#endif
+      default:  // CUDA/TRT also land here without USE_ONNX_CUDART.
+        free(input_tensor_data_);
+        for (void* ptr : output_tensors_data_) {
+          free(ptr);
+        }
+    }
+  }
+  OnnxProvider provider_;
+  void* input_tensor_data_ = nullptr;  // Host-side input buffer.
+  void* input_tensor_upload_device_ = nullptr;  // Allocated for CUDA/TRT only.
+  void* input_tensor_data_device_ = nullptr;  // Buffer bound as network input.
+  std::vector<void*> output_tensors_data_;  // Host-side output buffers.
+  std::vector<void*> output_tensors_data_device_;  // Buffers bound as outputs.
+  std::vector<size_t> output_tensors_step_;  // Elements per sample per output.
+  // To be removed when converting to new backend interface.
+  std::vector<float> wdl_output_data_;
+  Ort::MemoryInfo memory_info_{nullptr};
+#ifdef USE_ONNX_CUDART
+  cudaEvent_t inputs_uploaded_event_ = nullptr;
+  cudaEvent_t inputs_processed_event_ = nullptr;
+  cudaEvent_t evaluation_done_event_ = nullptr;
+  cudaEvent_t outputs_download_event_ = nullptr;
+#endif
+};
+
+template <typename DataType>
+class OnnxComputation final : public NetworkComputation {
+ public:
+  OnnxComputation(OnnxNetwork* network);
+  ~OnnxComputation();
+  void AddInput(InputPlanes&& input) override;
+  int GetBatchSize() const override;
+  void ComputeBlocking() override;
+  float GetQVal(int sample) const override;
+  float GetDVal(int sample) const override;
+  float GetPVal(int sample, int move_id) const override;
+  float GetMVal(int sample) const override;
+
+ private:
+  Ort::IoBinding PrepareInputs(int start, int batch_size, int step);
+
+  OnnxNetwork* network_;
+  size_t input_size_ = 0;  // Number of inputs added so far.
+  std::vector<InputPlanes> raw_input_;  // Inputs kept on the non-CUDA path.
+  std::unique_ptr<InputsOutputs> inputs_outputs_;  // Borrowed from network_.
+};
+
+class OnnxNetwork final : public Network {
+ public:
+  OnnxNetwork(const WeightsFile& file, const OptionsDict& options,
+              OnnxProvider provider, bool cpu_wdl);
+  ~OnnxNetwork();
+  std::unique_ptr<NetworkComputation> NewComputation() override {
+#ifdef USE_ONNX_CUDART
+    if (provider_ == OnnxProvider::CUDA || provider_ == OnnxProvider::TRT) {
+      int device = -1;
+      ReportCUDAErrors(cudaGetDevice(&device));
+      if (device != gpu_) {  // Switch the calling thread to this network's GPU.
+        ReportCUDAErrors(cudaSetDevice(gpu_));
+      }
+    }
+#endif
+    if (fp16_) {
+      return std::make_unique<OnnxComputation<Ort::Float16_t>>(this);
+    } else if (bf16_) {
+      return std::make_unique<OnnxComputation<Ort::BFloat16_t>>(this);
+    } else {
+      return std::make_unique<OnnxComputation<float>>(this);
+    }
+  }
+  const NetworkCapabilities& GetCapabilities() const override {
+    return capabilities_;
+  }
+  int GetMiniBatchSize() const override {
+    return batch_size_ == -1 ? Network::GetMiniBatchSize()
+                             : batch_size_ * steps_;
+  }
+  int GetPreferredBatchStep() const override {
+    return batch_size_ == -1 ? min_batch_size_ : batch_size_;
+  }
+  bool IsCpu() const override { return provider_ == OnnxProvider::CPU; }
+
+  Ort::SessionOptions GetOptions(int threads, int batch_size, uint64_t hash, int optimize);
+
+  std::unique_ptr<InputsOutputs> GetInputsOutputs() {
+    std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
+    if (free_inputs_outputs_.empty()) {
+      return std::make_unique<InputsOutputs>(this);  // Pool empty: allocate.
+    } else {
+      std::unique_ptr<InputsOutputs> resource =
+          std::move(free_inputs_outputs_.front());
+      free_inputs_outputs_.pop_front();
+      return resource;
+    }
+  }
+
+  void ReleaseInputsOutputs(std::unique_ptr<InputsOutputs> resource) {
+    std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
+    free_inputs_outputs_.push_back(std::move(resource));
+  }
+
+  Ort::Env onnx_env_;
+  // Prepare sessions for this many multiples of the batch size.
+  int steps_;
+  std::vector<Ort::Session> session_;
+  std::vector<std::string> inputs_;
+  std::vector<std::string> outputs_;
+  // Indices in the outputs_ vector, or -1 when the head is absent.
+  int policy_head_ = -1;
+  int wdl_head_ = -1;
+  int value_head_ = -1;
+  int mlh_head_ = -1;
+  NetworkCapabilities capabilities_;
+  bool fp16_;
+  bool bf16_;
+  bool cpu_wdl_;
+  // The batch size to use, or -1 for variable.
+  int batch_size_;
+  // The lower limit for variable batch size.
+  int min_batch_size_;
+  int gpu_;  // Device index passed to cudaSetDevice() and Ort::MemoryInfo.
+  static constexpr int max_batch_size_ = 1024;  // Capacity of each buffer set.
+  // lock_ is taken around evaluation for every provider other than CPU.
+  OnnxProvider provider_;
+  std::mutex lock_;
+  // For shared device addresses.
+#ifdef USE_ONNX_CUDART
+  cudaStream_t compute_stream_ = nullptr;
+  cudaStream_t upload_stream_ = nullptr;
+  cudaStream_t download_stream_ = nullptr;
+#endif
+
+ private:
+  std::mutex inputs_outputs_lock_;  // Guards free_inputs_outputs_.
+  std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;
+};
+
+InputsOutputs::InputsOutputs(OnnxNetwork* network)
+    : provider_(network->provider_) {
+  int max_batch_size = network->max_batch_size_;
+  int value_head = network->value_head_;
+  int wdl_head = network->wdl_head_;
+  int policy_head = network->policy_head_;
+  int mlh_head = network->mlh_head_;
+  int data_size = (network->fp16_ || network->bf16_) ? 2 : 4;  // Bytes/element.
+  int outputs_size =
+      std::max({value_head, wdl_head, policy_head, mlh_head}) + 1;
+  output_tensors_data_.resize(outputs_size);
+  output_tensors_data_device_.resize(outputs_size);
+  output_tensors_step_.resize(outputs_size);
+  if (wdl_head != -1) {
+    wdl_output_data_.resize(3 * max_batch_size);
+  }
+  output_tensors_step_[policy_head] = kNumOutputPolicy;
+  if (wdl_head != -1) {
+    output_tensors_step_[wdl_head] = 3;
+  }
+  if (value_head != -1) {
+    output_tensors_step_[value_head] = 1;
+  }
+  if (mlh_head != -1) {
+    output_tensors_step_[mlh_head] = 1;
+  }
+
+  switch (provider_) {
+    case OnnxProvider::CUDA:
+    case OnnxProvider::TRT:
+#ifdef USE_ONNX_CUDART
+      ReportCUDAErrors(
+          cudaEventCreate(&inputs_processed_event_, cudaEventDisableTiming));
+      ReportCUDAErrors(
+          cudaEventCreate(&inputs_uploaded_event_, cudaEventDisableTiming));
+      ReportCUDAErrors(
+          cudaEventCreate(&evaluation_done_event_, cudaEventDisableTiming));
+      ReportCUDAErrors(
+          cudaEventCreate(&outputs_download_event_, cudaEventDisableTiming));
+      ReportCUDAErrors(
+          cudaHostAlloc(&input_tensor_data_,
+                        max_batch_size * kInputPlanes * sizeof(InputPlane), 0));
+      for (int i = 0; i < outputs_size; i++) {
+        ReportCUDAErrors(cudaHostAlloc(
+            &output_tensors_data_[i],
+            max_batch_size * output_tensors_step_[i] * data_size, 0));
+      }
+
+      // Device side: packed-input upload area, expanded input, then outputs.
+      ReportCUDAErrors(
+          cudaMalloc(&input_tensor_upload_device_,
+                     max_batch_size * kInputPlanes * sizeof(InputPlane)));
+      ReportCUDAErrors(
+          cudaMalloc(&input_tensor_data_device_,
+                     max_batch_size * kInputPlanes * 8 * 8 * data_size));
+      for (int i = 0; i < outputs_size; i++) {
+        ReportCUDAErrors(
+            cudaMalloc(&output_tensors_data_device_[i],
+                       max_batch_size * output_tensors_step_[i] * data_size));
+      }
+      memory_info_ = Ort::MemoryInfo{"Cuda", OrtDeviceAllocator, network->gpu_,
+                                     OrtMemTypeDefault};
+      break;
+#endif
+    default:  // Host-only path: the "device" pointers alias the host buffers.
+      input_tensor_data_ =
+          malloc(max_batch_size * kInputPlanes * 8 * 8 * data_size);
+      for (int i = 0; i < outputs_size; i++) {
+        output_tensors_data_[i] =
+            malloc(max_batch_size * output_tensors_step_[i] * data_size);
+      }
+      input_tensor_data_device_ = input_tensor_data_;
+      for (int i = 0; i < outputs_size; i++) {
+        output_tensors_data_device_[i] = output_tensors_data_[i];
+      }
+      memory_info_ =
+          Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
+  }
+}
+
+OnnxNetwork::~OnnxNetwork() {
+#ifdef USE_ONNX_CUDART
+  if (provider_ == OnnxProvider::TRT || provider_ == OnnxProvider::CUDA) {
+    ReportCUDAErrors(cudaStreamDestroy(compute_stream_));
+    ReportCUDAErrors(cudaStreamDestroy(upload_stream_));
+    ReportCUDAErrors(cudaStreamDestroy(download_stream_));
+  }
+#endif  // USE_ONNX_CUDART
+}
+
+template <typename DataType>
+OnnxComputation<DataType>::OnnxComputation(OnnxNetwork* network)
+    : network_(network), inputs_outputs_(network->GetInputsOutputs()) {
+  // The buffer set comes from the network's pool (or is freshly allocated).
+}
+
+template <typename DataType>
+OnnxComputation<DataType>::~OnnxComputation() {
+  network_->ReleaseInputsOutputs(std::move(inputs_outputs_));  // Back to pool.
+}
+
+void AsDataType(float x, float* y) { *y = x; }  // fp32: plain copy.
+void AsDataType(float x, Ort::Float16_t* y) {
+  uint16_t tmp = FP32toFP16(x);  // 16-bit encoding from the fp16 helper.
+  std::memcpy(reinterpret_cast<uint16_t*>(y), &tmp, sizeof(uint16_t));
+}
+void AsDataType(float x, Ort::BFloat16_t* y) {
+  uint16_t tmp = FP32toBF16(x);  // 16-bit encoding from the bf16 helper.
+  std::memcpy(reinterpret_cast<uint16_t*>(y), &tmp, sizeof(uint16_t));
+}
+
+template <typename DataType>
+void OnnxComputation<DataType>::AddInput(InputPlanes&& input) {
+  // Appends one position to the batch. The capacity check below runs before
+  // either storage path, so no second check is needed after the increment.
+  if (input_size_ >= static_cast<size_t>(network_->max_batch_size_)) {
+    throw Exception("NN input exceeds max batch size of " +
+                    std::to_string(network_->max_batch_size_) + ".");
+  }
+#ifdef USE_ONNX_CUDART
+  if (network_->provider_ == OnnxProvider::CUDA ||
+      network_->provider_ == OnnxProvider::TRT) {
+    assert(input.size() == kInputPlanes);
+    // Host staging layout: plane masks for max_batch_size_ inputs first, then
+    // the per-plane values (already converted to DataType) right after them.
+    uint64_t* masks =
+        static_cast<uint64_t*>(inputs_outputs_->input_tensor_data_) +
+        input_size_ * kInputPlanes;
+    uint64_t* mask_end =
+        static_cast<uint64_t*>(inputs_outputs_->input_tensor_data_) +
+        network_->max_batch_size_ * kInputPlanes;
+    DataType* values =
+        reinterpret_cast<DataType*>(mask_end) + input_size_ * kInputPlanes;
+    for (size_t i = 0; i < kInputPlanes; i++) {
+      masks[i] = input[i].mask;
+      DataType value;
+      AsDataType(input[i].value, &value);
+      values[i] = value;
+    }
+    input_size_++;
+    return;
+  }
+#endif
+  raw_input_.emplace_back(std::move(input));
+  input_size_++;
+}
+template <typename DataType>
+int OnnxComputation<DataType>::GetBatchSize() const {
+  return static_cast<int>(input_size_);  // Inputs added so far.
+}
+
+float AsFloat(float x) { return x; }  // fp32: already a float.
+float AsFloat(Ort::Float16_t x) {
+  uint16_t tmp;
+  std::memcpy(&tmp, reinterpret_cast<uint16_t*>(&x), sizeof(uint16_t));
+  return FP16toFP32(tmp);  // Decode the copied 16-bit pattern.
+}
+float AsFloat(Ort::BFloat16_t x) {
+  uint16_t tmp;
+  std::memcpy(&tmp, reinterpret_cast<uint16_t*>(&x), sizeof(uint16_t));
+  return BF16toFP32(tmp);  // Decode the copied 16-bit pattern.
+}
+
+template <typename DataType>
+float OnnxComputation<DataType>::GetQVal(int sample) const {
+  if (network_->wdl_head_ == -1) {
+    // No WDL head: the value head output is returned as is.
+    void* raw = inputs_outputs_->output_tensors_data_[network_->value_head_];
+    return AsFloat(static_cast<const DataType*>(raw)[sample]);
+  }
+  // WDL head: entry 0 minus entry 2 of this sample's triple.
+  const float* wdl = inputs_outputs_->wdl_output_data_.data() + sample * 3;
+  return wdl[0] - wdl[2];
+}
+
+template <typename DataType>
+float OnnxComputation<DataType>::GetDVal(int sample) const {
+  const bool has_wdl = network_->wdl_head_ != -1;
+  return has_wdl ? inputs_outputs_->wdl_output_data_[sample * 3 + 1] : 0.0f;
+}
+
+template <typename DataType>
+float OnnxComputation<DataType>::GetPVal(int sample, int move_id) const {
+  const int index = sample * kNumOutputPolicy + move_id;
+  void* raw = inputs_outputs_->output_tensors_data_[network_->policy_head_];
+  return AsFloat(static_cast<const DataType*>(raw)[index]);
+}
+
+template <typename DataType>
+float OnnxComputation<DataType>::GetMVal(int sample) const {
+  const int head = network_->mlh_head_;
+  if (head == -1) return 0.0f;  // No mlh_head_ output configured.
+  void* raw = inputs_outputs_->output_tensors_data_[head];
+  return AsFloat(static_cast<const DataType*>(raw)[sample]);
+}
+
+template <typename DataType>
+Ort::IoBinding OnnxComputation<DataType>::PrepareInputs(int start,
+                                                        int batch_size,
+                                                        int step) {
+#ifdef USE_ONNX_CUDART
+  if (network_->provider_ != OnnxProvider::CUDA &&
+      network_->provider_ != OnnxProvider::TRT)
+#endif
+  {  // Host path: expand the stored planes into dense 8x8 DataType tensors.
+    DataType* iter =
+        static_cast<DataType*>(inputs_outputs_->input_tensor_data_);
+    iter += start * kInputPlanes * 8 * 8;
+    std::memset(static_cast<void*>(iter), 0,
+                batch_size * kInputPlanes * 8 * 8 * sizeof(DataType));
+    int end = std::min(start + batch_size, static_cast<int>(input_size_));
+    for (int i = start; i < end; i++) {  // Slots past `end` stay zeroed.
+      for (const auto& plane : raw_input_[i]) {
+        DataType value;
+        AsDataType(plane.value, &value);
+        for (auto bit : IterateBits(plane.mask)) {
+          *(iter + bit) = value;
+        }
+        iter += 64;  // Advance to the next 8x8 plane.
+      }
+    }
+  }
+
+  Ort::IoBinding binding{network_->session_[step - 1]};
+  for (size_t i = 0; i < inputs_outputs_->output_tensors_step_.size(); i++) {
+    int size = inputs_outputs_->output_tensors_step_[i];
+    int64_t dims[] = {batch_size, size};
+    binding.BindOutput(
+        network_->outputs_[i].c_str(),
+        Ort::Value::CreateTensor<DataType>(
+            inputs_outputs_->memory_info_,
+            static_cast<DataType*>(
+                inputs_outputs_->output_tensors_data_device_[i]) +
+                start * size,
+            size * batch_size, dims, 2));
+  }
+
+  int64_t dims[] = {batch_size, kInputPlanes, 8, 8};
+  binding.BindInput(
+      network_->inputs_[0].c_str(),
+      Ort::Value::CreateTensor<DataType>(
+          inputs_outputs_->memory_info_,
+          static_cast<DataType*>(inputs_outputs_->input_tensor_data_device_) +
+              start * kInputPlanes * 8 * 8,
+          batch_size * kInputPlanes * 8 * 8, dims, 4));
+  return binding;
+}
+
+// Evaluates all queued inputs in batches and post-processes the WDL output.
+template <typename DataType>
+void OnnxComputation<DataType>::ComputeBlocking() {
+  LCTRACE_FUNCTION_SCOPE;
+  int batch_size = network_->batch_size_;
+  if (batch_size < 0) {
+    batch_size =
+        std::max(static_cast<int>(input_size_), network_->min_batch_size_);
+  }
+  // Only the DML onnxruntime execution provider is documented as needing
+  // locking, but it seems all GPU backends need it. The guard releases the
+  // lock even when a CUDA or onnxruntime call below throws.
+  std::unique_lock guard(network_->lock_, std::defer_lock);
+  if (network_->provider_ != OnnxProvider::CPU) guard.lock();
+  for (size_t i = 0; i < (size_t)input_size_;) {
+    int step = (input_size_ - i + batch_size - 1) / batch_size;
+    if (step > network_->steps_) step = network_->steps_;
+    int batch = batch_size * step;
+    if (network_->provider_ == OnnxProvider::TRT && network_->batch_size_ > 0) {
+      batch = std::min((int)input_size_ - (int)i, batch);
+    }
+
+    auto binding = PrepareInputs(i, batch, step);
+
+    Ort::RunOptions options = {};
+#ifdef USE_ONNX_CUDART
+    if (network_->provider_ == OnnxProvider::TRT ||
+        network_->provider_ == OnnxProvider::CUDA) {
+      if (i == 0) {
+        ReportCUDAErrors(
+            cudaStreamWaitEvent(network_->upload_stream_,
+                                inputs_outputs_->inputs_processed_event_));
+      }
+      const char* src_masks =
+          static_cast<char*>(inputs_outputs_->input_tensor_data_);
+      char* dst_masks =
+          static_cast<char*>(inputs_outputs_->input_tensor_upload_device_);
+      src_masks += i * kInputPlanes * sizeof(uint64_t);
+      dst_masks += i * kInputPlanes * (sizeof(uint64_t) + sizeof(DataType));
+      ReportCUDAErrors(cudaMemcpyAsync(
+          dst_masks, src_masks, batch * kInputPlanes * sizeof(uint64_t),
+          cudaMemcpyHostToDevice, network_->upload_stream_));
+      char* src_values =
+          static_cast<char*>(inputs_outputs_->input_tensor_data_);
+      src_values += network_->max_batch_size_ * kInputPlanes * sizeof(uint64_t);
+      src_values += i * kInputPlanes * sizeof(DataType);
+      char* dst_values = dst_masks + batch * kInputPlanes * sizeof(uint64_t);
+      ReportCUDAErrors(cudaMemcpyAsync(
+          dst_values, src_values, batch * kInputPlanes * sizeof(DataType),
+          cudaMemcpyHostToDevice, network_->upload_stream_));
+      ReportCUDAErrors(cudaEventRecord(inputs_outputs_->inputs_uploaded_event_,
+                                       network_->upload_stream_));
+      ReportCUDAErrors(cudaStreamWaitEvent(
+          network_->compute_stream_, inputs_outputs_->inputs_uploaded_event_));
+      if (network_->fp16_) {
+        half* dst =
+            reinterpret_cast<half*>(inputs_outputs_->input_tensor_data_device_);
+        dst += i * kInputPlanes * 8 * 8;
+        expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes,
+                         network_->compute_stream_);
+      } else if (network_->bf16_) {
+        __nv_bfloat16* dst = reinterpret_cast<__nv_bfloat16*>(
+            inputs_outputs_->input_tensor_data_device_);
+        dst += i * kInputPlanes * 8 * 8;
+        expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes,
+                         network_->compute_stream_);
+      } else {
+        float* dst = reinterpret_cast<float*>(
+            inputs_outputs_->input_tensor_data_device_);
+        dst += i * kInputPlanes * 8 * 8;
+        expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes,
+                         network_->compute_stream_);
+      }
+
+      ReportCUDAErrors(cudaEventRecord(inputs_outputs_->inputs_processed_event_,
+                                       network_->upload_stream_));
+      if (i == 0) {
+        ReportCUDAErrors(
+            cudaStreamWaitEvent(network_->compute_stream_,
+                                inputs_outputs_->outputs_download_event_));
+      }
+      options.AddConfigEntry("disable_synchronize_execution_providers", "1");
+    } else
+#endif
+    {
+      binding.SynchronizeInputs();
+    }
+    network_->session_[step - 1].Run(options, binding);
+#ifdef USE_ONNX_CUDART
+    if (network_->provider_ == OnnxProvider::TRT ||
+        network_->provider_ == OnnxProvider::CUDA) {
+      for (size_t j = 0; j < inputs_outputs_->output_tensors_step_.size();
+           j++) {
+        ReportCUDAErrors(
+            cudaEventRecord(inputs_outputs_->evaluation_done_event_,
+                            network_->compute_stream_));
+        ReportCUDAErrors(
+            cudaStreamWaitEvent(network_->download_stream_,
+                                inputs_outputs_->evaluation_done_event_));
+        size_t offset = i * inputs_outputs_->output_tensors_step_[j];
+        ReportCUDAErrors(cudaMemcpyAsync(
+            static_cast<DataType*>(inputs_outputs_->output_tensors_data_[j]) +
+                offset,
+            static_cast<DataType*>(
+                inputs_outputs_->output_tensors_data_device_[j]) +
+                offset,
+            batch * inputs_outputs_->output_tensors_step_[j] * sizeof(DataType),
+            cudaMemcpyDeviceToHost, network_->download_stream_));
+        ReportCUDAErrors(
+            cudaEventRecord(inputs_outputs_->outputs_download_event_,
+                            network_->download_stream_));
+      }
+    } else
+#endif
+    {
+      binding.SynchronizeOutputs();
+    }
+    i += batch;
+  }
+  if (guard.owns_lock()) guard.unlock();
+#ifdef USE_ONNX_CUDART
+  if (network_->provider_ == OnnxProvider::TRT ||
+      network_->provider_ == OnnxProvider::CUDA) {
+    // Block until the last recorded device-to-host copy has completed.
+    ReportCUDAErrors(
+        cudaEventSynchronize(inputs_outputs_->outputs_download_event_));
+  }
+#endif
+  if (network_->wdl_head_ != -1) {
+    const DataType* data = static_cast<DataType*>(
+        inputs_outputs_->output_tensors_data_[network_->wdl_head_]);
+    for (size_t i = 0; i < input_size_; i++) {
+      float w = AsFloat(data[i * 3 + 0]);
+      float d = AsFloat(data[i * 3 + 1]);
+      float l = AsFloat(data[i * 3 + 2]);
+      if (network_->cpu_wdl_) {
+        // Value softmax done cpu side.
+        float m = std::max({w, d, l});
+        w = std::exp(w - m);
+        d = std::exp(d - m);
+        l = std::exp(l - m);
+        float sum = w + d + l;
+        w /= sum;
+        l /= sum;
+        d /= sum;
+      }
+      inputs_outputs_->wdl_output_data_[3 * i + 0] = w;
+      inputs_outputs_->wdl_output_data_[3 * i + 1] = d;
+      inputs_outputs_->wdl_output_data_[3 * i + 2] = l;
+    }
+  }
+}
+
+// Builds the onnxruntime session options for provider_. `hash` only feeds the
+// TRT engine cache name; `batch_size` < 0 selects a variable-batch profile.
+Ort::SessionOptions OnnxNetwork::GetOptions(int threads, int batch_size,
+                                            uint64_t hash, int optimize) {
+  Ort::SessionOptions options;
+  options.SetIntraOpNumThreads(threads);
+  // Map the numeric "optimize" setting to a graph optimization level:
+  // 0 disables, 1 is basic, 2 is extended and anything else enables all.
+  GraphOptimizationLevel level = GraphOptimizationLevel::ORT_ENABLE_ALL;
+  if (optimize == 0) {
+    level = GraphOptimizationLevel::ORT_DISABLE_ALL;
+  } else if (optimize == 1) {
+    level = GraphOptimizationLevel::ORT_ENABLE_BASIC;
+  } else if (optimize == 2) {
+    level = GraphOptimizationLevel::ORT_ENABLE_EXTENDED;
+  }
+  options.SetGraphOptimizationLevel(level);
+
+  if (batch_size > 0 && provider_ != OnnxProvider::TRT) {
+    // Override the default (variable) batch size.
+    Ort::ThrowOnError(
+        OrtGetApiBase()
+            ->GetApi(ORT_API_VERSION)
+            ->AddFreeDimensionOverrideByName(options, "batch", batch_size));
+  }
+
+  switch (provider_) {
+    case OnnxProvider::DML: {
+      std::unordered_map<std::string, std::string> dml_options;
+      dml_options["device_id"] = std::to_string(gpu_);
+      dml_options["performance_preference"] = "high_performance";
+      options.AppendExecutionProvider("DML", dml_options);
+      break;
+    }
+    case OnnxProvider::TRT: {
+      options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+
+      std::string cache_dir = CommandLine::BinaryDirectory() + "/trt_cache";
+      std::map<std::string, std::string> trt_options;
+      trt_options["device_id"] = std::to_string(gpu_);
+      trt_options["trt_builder_optimization_level"] = std::to_string(std::clamp(optimize, 0, 5));
+      trt_options["trt_fp16_enable"] = optimize >= 6 ? "1" : "0";
+#if ORT_API_VERSION >= 23
+      trt_options["trt_bf16_enable"] = optimize >= 7 ? "1" : "0";
+#endif
+      trt_options["trt_int8_enable"] = optimize >= 8 ? "1" : "0";
+      trt_options["trt_max_partition_iterations"] = "1000";
+      trt_options["trt_min_subgraph_size"] = "1";
+      trt_options["trt_engine_cache_enable"] = "1";
+      // We need the batch size as well as the hash, as it is set after loading.
+      std::ostringstream oss;
+      oss << std::hex << hash;
+      trt_options["trt_engine_cache_prefix"] =
+          "Lc0_ONNX_TRT_ORT_" + Ort::GetVersionString() + "_batch_" +
+          (batch_size < 0 ? std::to_string(batch_size)
+                          : std::to_string(batch_size - batch_size_ + 1) + "-" +
+                                std::to_string(batch_size)) +
+          "_" + std::to_string(optimize) + "_" + oss.str() + "_";
+      trt_options["trt_engine_cache_path"] = cache_dir;
+      trt_options["trt_timing_cache_enable"] = "1";
+      trt_options["trt_timing_cache_path"] = cache_dir;
+      trt_options["trt_layer_norm_fp32_fallback"] = "1";
+      trt_options["trt_force_sequential_engine_build"] = "1";
+      trt_options["trt_context_memory_sharing_enable"] = "1";
+      // Looks like we need I/O binding to enable this.
+#ifdef USE_ONNX_CUDART
+      trt_options["has_user_compute_stream"] = "1";
+#endif
+      if (batch_size < 0) {
+        trt_options["trt_profile_min_shapes"] =
+            inputs_[0] + ":" + std::to_string(min_batch_size_) + "x112x8x8";
+        trt_options["trt_profile_max_shapes"] =
+            inputs_[0] + ":" + std::to_string(max_batch_size_) + "x112x8x8";
+        trt_options["trt_profile_opt_shapes"] =
+            inputs_[0] + ":" + std::to_string(max_batch_size_ / 4) + "x112x8x8";
+      } else {
+        trt_options["trt_profile_min_shapes"] =
+            inputs_[0] + ":" + std::to_string(batch_size - batch_size_ + 1) +
+            "x112x8x8";
+        trt_options["trt_profile_max_shapes"] =
+            inputs_[0] + ":" + std::to_string(batch_size) + "x112x8x8";
+        trt_options["trt_profile_opt_shapes"] =
+            inputs_[0] + ":" + std::to_string(batch_size) + "x112x8x8";
+      }
+      std::vector<const char*> keys;
+      std::vector<const char*> values;
+      for (const auto& [key, value] : trt_options) {
+        keys.push_back(key.c_str());
+        values.push_back(value.c_str());
+      }
+
+      const auto& api = Ort::GetApi();
+      OrtTensorRTProviderOptionsV2* raw_options = nullptr;
+      Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&raw_options));
+      // Owning handle so the options are released even if an update throws.
+      std::unique_ptr<OrtTensorRTProviderOptionsV2,
+                      decltype(api.ReleaseTensorRTProviderOptions)>
+          trt_options_v2(raw_options, api.ReleaseTensorRTProviderOptions);
+      Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(
+          trt_options_v2.get(), keys.data(), values.data(), keys.size()));
+#ifdef USE_ONNX_CUDART
+      Ort::ThrowOnError(api.UpdateTensorRTProviderOptionsWithValue(
+          trt_options_v2.get(), "user_compute_stream", compute_stream_));
+#endif
+      options.AppendExecutionProvider_TensorRT_V2(*trt_options_v2);
+      break;
+    }
+    case OnnxProvider::ROCM: {
+      OrtROCMProviderOptions rocm_options;
+      rocm_options.device_id = gpu_;
+      options.AppendExecutionProvider_ROCM(rocm_options);
+      break;
+    }
+    case OnnxProvider::MIGRAPHX: {
+      std::unordered_map<std::string, std::string> migraphx_options;
+      migraphx_options["device_id"] = std::to_string(gpu_);
+      migraphx_options["migraphx_exhaustive_tune"] = optimize >= 5 ? "1" : "0";
+      migraphx_options["migraphx_fp16_enable"] = optimize >= 6 ? "1" : "0";
+      migraphx_options["migraphx_bf16_enable"] = optimize >= 7 ? "1" : "0";
+      migraphx_options["migraphx_fp8_enable"] = optimize >= 8 ? "1" : "0";
+      std::filesystem::path cache_dir = CommandLine::BinaryDirectory();
+      cache_dir /= "migraphx_cache";
+
+      if (!std::filesystem::exists(cache_dir)) {
+        std::filesystem::create_directories(cache_dir);
+      }
+      migraphx_options["migraphx_model_cache_dir"] = cache_dir.string();
+
+      options.AppendExecutionProvider("MIGraphX", migraphx_options);
+      break;
+    }
+    case OnnxProvider::CUDA: {
+      OrtCUDAProviderOptions cuda_options;
+      cuda_options.device_id = gpu_;
+#ifdef USE_ONNX_CUDART
+      cuda_options.has_user_compute_stream = true;
+      cuda_options.user_compute_stream = compute_stream_;
+#endif
+      options.AppendExecutionProvider_CUDA(cuda_options);
+      break;
+    }
+    case OnnxProvider::CPU:
+      // The CPU execution provider is always available.
+      break;
+  }
+  return options;
+}
+
+// Reads the backend options, records the model's input/output names and
+// creates one onnxruntime session per batch step (steps_ sessions in total).
+OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict& opts,
+                         OnnxProvider provider, bool cpu_wdl)
+    : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"),
+      capabilities_{file.format().network_format().input(),
+                    file.format().network_format().output(),
+                    file.format().network_format().moves_left()},
+      fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16),
+      bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16),
+      cpu_wdl_(cpu_wdl),
+      provider_(provider) {
+  onnx_env_.DisableTelemetryEvents();
+
+  gpu_ = opts.GetOrDefault<int>("gpu", 0);
+
+#ifdef USE_ONNX_CUDART
+  if (provider_ == OnnxProvider::CUDA || provider_ == OnnxProvider::TRT) {
+    cudaDeviceProp deviceProp = {};
+    if (!cudaGetDeviceProperties(&deviceProp, gpu_)) {
+      CERR << "GPU: " << deviceProp.name;
+      CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30)
+           << " Gb";
+      int clockRate = 0;
+      ReportCUDAErrors(
+          cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, gpu_));
+      CERR << "GPU clock frequency: " << clockRate / 1e3f << " MHz";
+    }
+#if CUDART_VERSION >= 12080
+    int runtime_version;
+    ReportCUDAErrors(cudaRuntimeGetVersion(&runtime_version));
+    if (runtime_version >= 12080) {
+      int attr;
+      ReportCUDAErrors(
+          cudaDeviceGetAttribute(&attr, cudaDevAttrGpuPciDeviceId, gpu_));
+      uint32_t pci_device = attr;
+      CERR << "GPU device ID: " << std::hex << (pci_device & 0xffff) << ":"
+           << (pci_device >> 16);
+      ReportCUDAErrors(
+          cudaDeviceGetAttribute(&attr, cudaDevAttrGpuPciSubsystemId, gpu_));
+      uint32_t pci_subsystem = attr;
+      CERR << "GPU subsystem ID: " << std::hex << (pci_subsystem & 0xffff)
+           << ":" << (pci_subsystem >> 16) << std::dec;
+    }
+#endif
+  }
+#endif
+
+  int threads =
+      opts.GetOrDefault<int>("threads", provider == OnnxProvider::CPU ? 1 : 0);
+  int default_batch = -1;
+  int default_steps = 1;
+  int default_min_batch = 1;
+  switch (provider) {
+    case OnnxProvider::DML:
+    case OnnxProvider::MIGRAPHX:
+      default_batch = 16;
+      default_steps = 4;
+      break;
+    case OnnxProvider::TRT:
+      default_min_batch = 4;
+      break;
+    default:
+      break;
+  }
+
+  int optimize = opts.GetOrDefault<int>("optimize", 3);
+  batch_size_ = opts.GetOrDefault<int>("batch", default_batch);
+  steps_ = opts.GetOrDefault<int>("steps", default_steps);
+  min_batch_size_ = opts.GetOrDefault<int>("min_batch", default_min_batch);
+
+  // Sanity checks. Out-of-range "steps"/"min_batch" values are clamped, since
+  // zero sessions or a zero batch size would break ComputeBlocking().
+  if (batch_size_ <= 0) {
+    batch_size_ = -1;  // Variable batch size.
+    steps_ = 1;
+  }
+  if (steps_ < 1) steps_ = 1;
+  if (steps_ > max_batch_size_) steps_ = max_batch_size_;
+  if (min_batch_size_ < 1) min_batch_size_ = 1;
+  if (batch_size_ * steps_ > max_batch_size_) {
+    batch_size_ = max_batch_size_ / steps_;
+  }
+
+  const auto& md = file.onnx_model();
+  if (!md.has_input_planes()) {
+    throw Exception("NN doesn't have input planes defined.");
+  }
+  inputs_.emplace_back(md.input_planes());
+  if (!md.has_output_policy()) {
+    throw Exception("NN doesn't have policy head defined.");
+  }
+  policy_head_ = outputs_.size();
+  outputs_.emplace_back(md.output_policy());
+  if (md.has_output_wdl()) {
+    wdl_head_ = outputs_.size();
+    outputs_.emplace_back(md.output_wdl());
+  } else if (md.has_output_value()) {
+    value_head_ = outputs_.size();
+    outputs_.emplace_back(md.output_value());
+  } else {
+    throw Exception("NN doesn't have value head.");
+  }
+  if (md.has_output_mlh()) {
+    mlh_head_ = outputs_.size();
+    outputs_.emplace_back(md.output_mlh());
+  }
+  const uint64_t hash = provider == OnnxProvider::TRT
+                            ? std::hash<std::string_view>()(md.model())
+                            : 0;
+  if (provider == OnnxProvider::TRT || provider == OnnxProvider::CUDA) {
+#ifdef USE_ONNX_CUDART
+    ReportCUDAErrors(cudaSetDevice(gpu_));
+    ReportCUDAErrors(cudaStreamCreate(&compute_stream_));
+    ReportCUDAErrors(cudaStreamCreate(&upload_stream_));
+    ReportCUDAErrors(cudaStreamCreate(&download_stream_));
+#else
+    CERR << "WARNING: Simplified version without CUDA enhancements.";
+#endif
+  }
+
+  for (int step = 1; step <= steps_; step++)
+    session_.emplace_back(onnx_env_, md.model().data(), md.model().size(),
+                          GetOptions(threads, batch_size_ * step, hash, optimize));
+}
+
+// Backend factory for the ONNX providers. A weights file that already embeds
+// an ONNX model is used directly; any other file is first converted to ONNX
+// using defaults that depend on kProvider and can be overridden via `opts`.
+template <OnnxProvider kProvider>
+std::unique_ptr<Network> MakeOnnxNetwork(const std::optional<WeightsFile>& w,
+                                         const OptionsDict& opts) {
+  if (!w) throw Exception("The ONNX backend requires a network file.");
+  if (w->has_onnx_model()) {
+    return std::make_unique<OnnxNetwork>(*w, opts, kProvider, false);
+  }
+
+  constexpr bool kIsCpu = kProvider == OnnxProvider::CPU;
+  const bool relu2_on_dml =
+      kProvider == OnnxProvider::DML &&
+      w->format().network_format().ffn_activation() ==
+          pblczero::NetworkFormat::ACTIVATION_RELU_2;
+
+  WeightsToOnnxConverterOptions conv;
+  conv.ir = opts.GetOrDefault<int>("ir", -1);
+  conv.alt_mish = opts.GetOrDefault<bool>("alt_mish", kIsCpu);
+  conv.alt_layernorm = opts.GetOrDefault<bool>("alt_layernorm", relu2_on_dml);
+  conv.no_shape = opts.GetOrDefault<bool>("no_shape", false);
+  conv.policy_head = opts.GetOrDefault<std::string>("policy_head", "vanilla");
+  conv.value_head = opts.GetOrDefault<std::string>("value_head", "winner");
+  // The converted model omits the WDL softmax; it is applied on the CPU instead
+  // (see the `true` passed as cpu_wdl below).
+  conv.no_wdl_softmax = true;
+  // No execution provider has a better mish version, some don't even have it.
+  conv.real_mish = false;
+
+  // An explicit "datatype" option wins; otherwise "fp16" (off by default only
+  // for the CPU provider) selects between f16 and f32.
+  std::string datatype;
+  if (opts.Exists<std::string>("datatype")) {
+    datatype = opts.Get<std::string>("datatype");
+  } else {
+    datatype = opts.GetOrDefault<bool>("fp16", !kIsCpu) ? "f16" : "f32";
+  }
+  conv.data_type = WeightsToOnnxConverterOptions::StringToDataType(datatype);
+
+  // bfloat16 models default to opset 22, everything else to opset 17.
+  const bool is_bf16 =
+      conv.data_type == WeightsToOnnxConverterOptions::DataType::kBFloat16;
+  conv.opset = opts.GetOrDefault<int>("opset", is_bf16 ? 22 : 17);
+
+  auto converted = ConvertWeightsToOnnx(*w, conv);
+  return std::make_unique<OnnxNetwork>(converted, opts, kProvider, true);
+}
+
+#ifdef USE_MIGRAPHX  // Optional providers register only when their macro is set.
+REGISTER_NETWORK("onnx-migraphx", MakeOnnxNetwork<OnnxProvider::MIGRAPHX>, 65)
+#endif  // USE_MIGRAPHX
+#ifdef USE_ROCM
+REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork<OnnxProvider::ROCM>, 64)
+#endif  // USE_ROCM
+#ifdef USE_DML
+REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork<OnnxProvider::DML>, 63)
+#endif  // USE_DML; the TRT, CUDA and CPU entries below are unconditional.
+REGISTER_NETWORK("onnx-trt", MakeOnnxNetwork<OnnxProvider::TRT>, 60)
+REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork<OnnxProvider::CUDA>, 61)
+REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork<OnnxProvider::CPU>, 62)
+
+}  // namespace onnx
+}  // namespace lczero
diff --git a/src/neural/backends/onnx/onnx_kernels.cu b/src/neural/backends/onnx/onnx_kernels.cu
new file mode 100644
index 0000000000..1da1d0f232
--- /dev/null
+++ b/src/neural/backends/onnx/onnx_kernels.cu
@@ -0,0 +1,112 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include <cstdint>
+#include <string>
+
+#include "neural/backends/onnx/onnx_kernels.h"
+#include "utils/exception.h"
+
+namespace lczero {
+namespace onnx {
+
+// Expands bit-packed planes into dense values. Each thread handles
+// `bits_per_thread` consecutive squares of one plane: squares whose bit is set
+// in the plane's mask receive the plane's value, all others are zero.
+//   output: n * 64 elements, 64 consecutive elements per plane.
+//   masks:  n 64-bit masks; bit i of a mask maps to element i of its plane.
+//   values: n per-plane fill values.
+template <unsigned bits_per_thread, typename DataType>
+__global__ void expandPlanes_kernel(DataType* output, const uint64_t* masks,
+                                    const DataType* values, unsigned n) {
+  // A thread must never straddle two planes, so the chunk has to divide 64.
+  static_assert(bits_per_thread > 0 && 64 % bits_per_thread == 0,
+                "bits_per_thread must divide 64");
+  unsigned index = threadIdx.x + blockDim.x * blockIdx.x;
+  index *= bits_per_thread;
+  unsigned planeIndex = index >> 6;
+  if (planeIndex >= n) return;
+
+  uint64_t mask = masks[planeIndex];
+  unsigned sqIndex = index & 0x3F;
+  DataType value = values[planeIndex];
+  DataType op[bits_per_thread] = {};
+  mask >>= sqIndex;
+  for (unsigned i = 0; i < bits_per_thread; i++) {
+    if (mask & 0x1) {
+      op[i] = value;
+    }
+    mask >>= 1;
+  }
+  for (unsigned i = 0; i < bits_per_thread; i++) {
+    output[index + i] = op[i];
+  }
+}
+
+// Launches expandPlanes_kernel on `stream` for `n` planes. `input` must be a
+// device-accessible buffer holding n uint64_t masks immediately followed by n
+// DataType values. Throws Exception if the launch reports a CUDA error.
+template <typename DataType>
+void expandPlanesOnnx(DataType* output, const void* input, unsigned n,
+                      cudaStream_t stream) {
+  // A zero-block launch is an invalid configuration, so skip empty batches.
+  if (n == 0) return;
+  constexpr unsigned bits_per_thread = 2;
+  int threads = n * 8 * 8 / bits_per_thread;
+  const int blockSize = 256;
+  int blocks = DivUp(threads, blockSize);
+
+  const uint64_t* masks = static_cast<const uint64_t*>(input);
+  const DataType* values = reinterpret_cast<const DataType*>(masks + n);
+
+  expandPlanes_kernel<bits_per_thread>
+      <<<blocks, blockSize, 0, stream>>>(output, masks, values, n);
+
+  ReportCUDAErrors(cudaGetLastError());
+}
+
+// Throws Exception describing `status` and the call site unless `status` is
+// cudaSuccess. Normally invoked through the ReportCUDAErrors() macro.
+void CudaError(cudaError_t status, const char* file, int line) {
+  if (status != cudaSuccess) {
+    auto err = std::string("CUDA error: ") + cudaGetErrorString(status) + " (" +
+               file + ":" + std::to_string(line) + ") ";
+    throw Exception(err);
+  }
+}
+
+// Explicit instantiations for the element types the ONNX backend passes in.
+template void expandPlanesOnnx<half>(half* output, const void* input,
+                                     unsigned n, cudaStream_t stream);
+template void expandPlanesOnnx<float>(float* output, const void* input,
+                                      unsigned n, cudaStream_t stream);
+template void expandPlanesOnnx<__nv_bfloat16>(__nv_bfloat16* output,
+                                              const void* input, unsigned n,
+                                              cudaStream_t stream);
+
+}  // namespace onnx
+}  // namespace lczero
diff --git a/src/neural/backends/onnx/onnx_kernels.h b/src/neural/backends/onnx/onnx_kernels.h
new file mode 100644
index 0000000000..f16b981da7
--- /dev/null
+++ b/src/neural/backends/onnx/onnx_kernels.h
@@ -0,0 +1,49 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+namespace lczero {
+namespace onnx {
+
+// Expands `n` bit-packed input planes from `input` into floating point tensor
+// data in `output`, on `stream`. Used as a preprocessing step of ONNX models.
+template <typename DataType>
+void expandPlanesOnnx(DataType* output, const void* input, unsigned n,
+                      cudaStream_t stream);
+// Checks a CUDA status code; the macro passes the call site's file and line.
+#define ReportCUDAErrors(status) CudaError(status, __FILE__, __LINE__)
+void CudaError(cudaError_t status, const char* file, int line);
+// Computes (a + b - 1) / b, i.e. a / b rounded up for positive operands.
+inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+
+}  // namespace onnx
+}  // namespace lczero
diff --git a/src/neural/opencl/OpenCL.cc b/src/neural/backends/opencl/OpenCL.cc
similarity index 98%
rename from src/neural/opencl/OpenCL.cc
rename to src/neural/backends/opencl/OpenCL.cc
index 746fcff441..563cef317e 100644
--- a/src/neural/opencl/OpenCL.cc
+++ b/src/neural/backends/opencl/OpenCL.cc
@@ -32,9 +32,9 @@
 #include <string>
 #include <thread>
 
-#include "neural/opencl/OpenCL.h"
-#include "neural/opencl/OpenCLParams.h"
-#include "neural/opencl/OpenCLTuner.h"
+#include "neural/backends/opencl/OpenCL.h"
+#include "neural/backends/opencl/OpenCLParams.h"
+#include "neural/backends/opencl/OpenCLTuner.h"
 #include "utils/logging.h"
 
 static std::string cl_args =
diff --git a/src/neural/opencl/OpenCL.h b/src/neural/backends/opencl/OpenCL.h
similarity index 97%
rename from src/neural/opencl/OpenCL.h
rename to src/neural/backends/opencl/OpenCL.h
index 559f1ef382..08b0324d58 100644
--- a/src/neural/opencl/OpenCL.h
+++ b/src/neural/backends/opencl/OpenCL.h
@@ -36,9 +36,16 @@ using net_t = float;
 #include <string>
 #include <vector>
 
-#include "cl2.hpp"
-#include "neural/opencl/OpenCLBuffers.h"
-#include "neural/opencl/OpenCLParams.h"
+#if __has_include("CL/opencl.hpp")
+#include "CL/opencl.hpp"
+#elif __has_include("OpenCL/opencl.hpp")
+#include "OpenCL/opencl.hpp"
+#else
+#include "opencl.hpp"
+#endif
+
+#include "neural/backends/opencl/OpenCLBuffers.h"
+#include "neural/backends/opencl/OpenCLParams.h"
 
 inline size_t ceilMultiple(size_t a, size_t b) {
   if (a % b == 0) return a;
diff --git a/src/neural/opencl/OpenCLBuffers.cc b/src/neural/backends/opencl/OpenCLBuffers.cc
similarity index 99%
rename from src/neural/opencl/OpenCLBuffers.cc
rename to src/neural/backends/opencl/OpenCLBuffers.cc
index a12f4a4d6a..1331c6caae 100644
--- a/src/neural/opencl/OpenCLBuffers.cc
+++ b/src/neural/backends/opencl/OpenCLBuffers.cc
@@ -19,7 +19,7 @@
   along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#include "neural/opencl/OpenCLBuffers.h"
+#include "neural/backends/opencl/OpenCLBuffers.h"
 
 OpenCLBuffers::OpenCLBuffers(const OpenCL_Network& opencl_net)
     : m_opencl_net(opencl_net), m_opencl(opencl_net.getOpenCL()) {
@@ -298,7 +298,7 @@ void OpenCLBuffers::forward(const std::vector<net_t>& input,
   auto pinnedOutBufferHost_val = m_commandqueue.enqueueMapBuffer(
       m_pinnedOutBuffer_val, CL_FALSE, CL_MAP_READ, 0,
       batch_size * m_finalSize_val);
-  void* pinnedOutBufferHost_mov;
+  void* pinnedOutBufferHost_mov = nullptr;
   if (m_finalSize_mov > 0) {
     pinnedOutBufferHost_mov = m_commandqueue.enqueueMapBuffer(
         m_pinnedOutBuffer_mov, CL_FALSE, CL_MAP_READ, 0,
diff --git a/src/neural/opencl/OpenCLBuffers.h b/src/neural/backends/opencl/OpenCLBuffers.h
similarity index 96%
rename from src/neural/opencl/OpenCLBuffers.h
rename to src/neural/backends/opencl/OpenCLBuffers.h
index 56d53c4703..3742734b83 100644
--- a/src/neural/opencl/OpenCLBuffers.h
+++ b/src/neural/backends/opencl/OpenCLBuffers.h
@@ -34,9 +34,9 @@
 #include <string>
 #include <thread>
 
-#include "neural/opencl/OpenCL.h"
-#include "neural/opencl/OpenCLParams.h"
-#include "neural/opencl/OpenCLTuner.h"
+#include "neural/backends/opencl/OpenCL.h"
+#include "neural/backends/opencl/OpenCLParams.h"
+#include "neural/backends/opencl/OpenCLTuner.h"
 #include "utils/logging.h"
 
 class OpenCL_Network;
diff --git a/src/neural/opencl/OpenCLParams.h b/src/neural/backends/opencl/OpenCLParams.h
similarity index 100%
rename from src/neural/opencl/OpenCLParams.h
rename to src/neural/backends/opencl/OpenCLParams.h
diff --git a/src/neural/opencl/OpenCLTuner.cc b/src/neural/backends/opencl/OpenCLTuner.cc
similarity index 99%
rename from src/neural/opencl/OpenCLTuner.cc
rename to src/neural/backends/opencl/OpenCLTuner.cc
index 1e8a0c334b..d25dc48f8c 100644
--- a/src/neural/opencl/OpenCLTuner.cc
+++ b/src/neural/backends/opencl/OpenCLTuner.cc
@@ -19,7 +19,7 @@
   along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#include "neural/opencl/OpenCLTuner.h"
+#include "neural/backends/opencl/OpenCLTuner.h"
 
 #include <array>
 #include <cassert>
@@ -31,8 +31,8 @@
 #include <sstream>
 #include <string>
 
-#include "neural/opencl/OpenCL.h"
-#include "neural/opencl/OpenCLParams.h"
+#include "neural/backends/opencl/OpenCL.h"
+#include "neural/backends/opencl/OpenCLParams.h"
 #include "utils/logging.h"
 
 constexpr auto MAX_ERROR = 1e-4f;
diff --git a/src/neural/opencl/OpenCLTuner.h b/src/neural/backends/opencl/OpenCLTuner.h
similarity index 96%
rename from src/neural/opencl/OpenCLTuner.h
rename to src/neural/backends/opencl/OpenCLTuner.h
index 0c1ed449ed..fec9105030 100644
--- a/src/neural/opencl/OpenCLTuner.h
+++ b/src/neural/backends/opencl/OpenCLTuner.h
@@ -26,8 +26,8 @@
 #include <vector>
 
 #include "OpenCLParams.h"
-#include "neural/opencl/OpenCL.h"
-#include "neural/opencl/OpenCLParams.h"
+#include "neural/backends/opencl/OpenCL.h"
+#include "neural/backends/opencl/OpenCLParams.h"
 
 using Configurations = std::pair<std::string, std::vector<size_t>>;
 using TuneParameters = std::map<std::string, size_t>;
diff --git a/src/neural/opencl/README.md b/src/neural/backends/opencl/README.md
similarity index 100%
rename from src/neural/opencl/README.md
rename to src/neural/backends/opencl/README.md
diff --git a/src/neural/opencl/clblast_level3/common.opencl b/src/neural/backends/opencl/clblast_level3/common.opencl
similarity index 100%
rename from src/neural/opencl/clblast_level3/common.opencl
rename to src/neural/backends/opencl/clblast_level3/common.opencl
diff --git a/src/neural/opencl/clblast_level3/xgemm_batched.opencl b/src/neural/backends/opencl/clblast_level3/xgemm_batched.opencl
similarity index 100%
rename from src/neural/opencl/clblast_level3/xgemm_batched.opencl
rename to src/neural/backends/opencl/clblast_level3/xgemm_batched.opencl
diff --git a/src/neural/opencl/clblast_level3/xgemm_part1.opencl b/src/neural/backends/opencl/clblast_level3/xgemm_part1.opencl
similarity index 100%
rename from src/neural/opencl/clblast_level3/xgemm_part1.opencl
rename to src/neural/backends/opencl/clblast_level3/xgemm_part1.opencl
diff --git a/src/neural/opencl/clblast_level3/xgemm_part2.opencl b/src/neural/backends/opencl/clblast_level3/xgemm_part2.opencl
similarity index 100%
rename from src/neural/opencl/clblast_level3/xgemm_part2.opencl
rename to src/neural/backends/opencl/clblast_level3/xgemm_part2.opencl
diff --git a/src/neural/opencl/clblast_level3/xgemm_part3.opencl b/src/neural/backends/opencl/clblast_level3/xgemm_part3.opencl
similarity index 100%
rename from src/neural/opencl/clblast_level3/xgemm_part3.opencl
rename to src/neural/backends/opencl/clblast_level3/xgemm_part3.opencl
diff --git a/src/neural/opencl/clblast_level3/xgemv.opencl b/src/neural/backends/opencl/clblast_level3/xgemv.opencl
similarity index 100%
rename from src/neural/opencl/clblast_level3/xgemv.opencl
rename to src/neural/backends/opencl/clblast_level3/xgemv.opencl
diff --git a/src/neural/opencl/clsource/config.opencl b/src/neural/backends/opencl/clsource/config.opencl
similarity index 100%
rename from src/neural/opencl/clsource/config.opencl
rename to src/neural/backends/opencl/clsource/config.opencl
diff --git a/src/neural/opencl/clsource/convolve1.opencl b/src/neural/backends/opencl/clsource/convolve1.opencl
similarity index 100%
rename from src/neural/opencl/clsource/convolve1.opencl
rename to src/neural/backends/opencl/clsource/convolve1.opencl
diff --git a/src/neural/opencl/clsource/convolve3.opencl b/src/neural/backends/opencl/clsource/convolve3.opencl
similarity index 100%
rename from src/neural/opencl/clsource/convolve3.opencl
rename to src/neural/backends/opencl/clsource/convolve3.opencl
diff --git a/src/neural/opencl/clsource/policymap.opencl b/src/neural/backends/opencl/clsource/policymap.opencl
similarity index 100%
rename from src/neural/opencl/clsource/policymap.opencl
rename to src/neural/backends/opencl/clsource/policymap.opencl
diff --git a/src/neural/opencl/clsource/se.opencl b/src/neural/backends/opencl/clsource/se.opencl
similarity index 100%
rename from src/neural/opencl/clsource/se.opencl
rename to src/neural/backends/opencl/clsource/se.opencl
diff --git a/src/neural/opencl/network_opencl.cc b/src/neural/backends/opencl/network_opencl.cc
similarity index 98%
rename from src/neural/opencl/network_opencl.cc
rename to src/neural/backends/opencl/network_opencl.cc
index 5d71d06291..b2a6515660 100644
--- a/src/neural/opencl/network_opencl.cc
+++ b/src/neural/backends/opencl/network_opencl.cc
@@ -22,14 +22,14 @@
 #include <condition_variable>
 #include <thread>
 
+#include "neural/backends/opencl/OpenCL.h"
+#include "neural/backends/opencl/OpenCLParams.h"
+#include "neural/backends/shared/activation.h"
+#include "neural/backends/shared/winograd_filter.h"
 #include "neural/factory.h"
 #include "neural/network.h"
 #include "neural/network_legacy.h"
-#include "neural/opencl/OpenCL.h"
-#include "neural/opencl/OpenCLParams.h"
-#include "neural/shared/activation.h"
-#include "neural/shared/policy_map.h"
-#include "neural/shared/winograd_filter.h"
+#include "neural/tables/policy_map.h"
 #include "utils/bititer.h"
 #include "utils/exception.h"
 #include "utils/filesystem.h"
@@ -231,7 +231,7 @@ void OpenCLComputation::EncodePlanes(const InputPlanes& sample, float* buffer) {
 
 class OpenCLNetwork : public Network {
  public:
-  virtual ~OpenCLNetwork(){};
+  virtual ~OpenCLNetwork() {};
 
   OpenCLNetwork(const WeightsFile& file, const OptionsDict& options)
       : capabilities_{file.format().network_format().input(),
@@ -247,15 +247,15 @@ class OpenCLNetwork : public Network {
     params_.tune_only = options.GetOrDefault<bool>("tune_only", false);
     params_.tune_exhaustive =
         options.GetOrDefault<bool>("tune_exhaustive", false);
-    if (options.IsDefault<std::string>("tuner_file")) {
+    if (options.Exists<std::string>("tuner_file")) {
+      params_.tuner_file = options.Get<std::string>("tuner_file");
+    } else {
       std::string user_cache_path = GetUserCacheDirectory();
       if (!user_cache_path.empty()) {
         user_cache_path += "lc0/";
         CreateDirectory(user_cache_path);
       }
       params_.tuner_file = user_cache_path + "leelaz_opencl_tuning";
-    } else {
-      params_.tuner_file = options.Get<std::string>("tuner_file");
     }
 
     wdl_ = file.format().network_format().output() ==
diff --git a/src/neural/shared/activation.cc b/src/neural/backends/shared/activation.cc
similarity index 99%
rename from src/neural/shared/activation.cc
rename to src/neural/backends/shared/activation.cc
index 1710d9e802..6f2551a7e6 100644
--- a/src/neural/shared/activation.cc
+++ b/src/neural/backends/shared/activation.cc
@@ -16,7 +16,7 @@
  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "neural/shared/activation.h"
+#include "neural/backends/shared/activation.h"
 
 #include <algorithm>
 #include <cmath>
diff --git a/src/neural/shared/activation.h b/src/neural/backends/shared/activation.h
similarity index 72%
rename from src/neural/shared/activation.h
rename to src/neural/backends/shared/activation.h
index 6f110886db..b786d3435d 100644
--- a/src/neural/shared/activation.h
+++ b/src/neural/backends/shared/activation.h
@@ -21,27 +21,9 @@
 #include <cstddef>
 #include <vector>
 
-namespace lczero {
-// The following list matches the one in net.proto. Ideally this would be done
-// by including proto/net.pb.h, but this is incompatible with nvcc.
-enum ActivationFunction {
-  ACTIVATION_DEFAULT = 0,
-  ACTIVATION_MISH = 1,
-  ACTIVATION_RELU = 2,
-  ACTIVATION_NONE = 3,
-  ACTIVATION_TANH = 4,
-  ACTIVATION_SIGMOID = 5,
-  ACTIVATION_SELU = 6,
-  ACTIVATION_SWISH = 7,
-  ACTIVATION_RELU_2 = 8,
-  ACTIVATION_SOFTMAX = 9,
-};
+#include "neural/tables/activation_function.h"
 
-struct Activations {
-    ActivationFunction default_activation = ACTIVATION_RELU;
-    ActivationFunction smolgen_activation = ACTIVATION_SWISH;
-    ActivationFunction ffn_activation = ACTIVATION_RELU_2;
-};
+namespace lczero {
 
 // Softmax activation
 void SoftmaxActivation(const size_t size, const float* input, float* output);
diff --git a/src/neural/shared/activation.ispc b/src/neural/backends/shared/activation.ispc
similarity index 100%
rename from src/neural/shared/activation.ispc
rename to src/neural/backends/shared/activation.ispc
diff --git a/src/neural/shared/winograd_filter.cc b/src/neural/backends/shared/winograd_filter.cc
similarity index 98%
rename from src/neural/shared/winograd_filter.cc
rename to src/neural/backends/shared/winograd_filter.cc
index 3bedd29596..e985565663 100644
--- a/src/neural/shared/winograd_filter.cc
+++ b/src/neural/backends/shared/winograd_filter.cc
@@ -16,7 +16,7 @@
  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "neural/shared/winograd_filter.h"
+#include "neural/backends/shared/winograd_filter.h"
 
 #include <array>
 
diff --git a/src/neural/shared/winograd_filter.h b/src/neural/backends/shared/winograd_filter.h
similarity index 100%
rename from src/neural/shared/winograd_filter.h
rename to src/neural/backends/shared/winograd_filter.h
diff --git a/src/neural/backends/sycl/common_kernels.dp.cpp b/src/neural/backends/sycl/common_kernels.dp.cpp
new file mode 100644
index 0000000000..8cae7bbf49
--- /dev/null
+++ b/src/neural/backends/sycl/common_kernels.dp.cpp
@@ -0,0 +1,2036 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier:GNU General Public License v3.0 or later
+*/
+
+#include <sycl/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+
+#include "sycl_common.h"
+#include "neural/backends/shared/activation.h"
+#include "neural/tables/attention_policy_map.h"
+#include "winograd_helper.h"
+#include <cmath>
+
+namespace lczero {
+namespace sycldnn_backend {
+namespace {
+// Number of input planes per board position. The fp16 NHWC expand kernel in
+// this file uses it as the per-board plane count when it decodes a flat
+// element index into (board, square, plane).
+constexpr int kInputPlanes = 112;
+}  // namespace
+
+/////////////////////////////////////////////////////////////////////////////
+//          Simple SYCL kernels used by certain layers                     //
+/////////////////////////////////////////////////////////////////////////////
+
+// Kernel body for addVectors: computes
+//   c[i] = activate(a[i % asize] + b[i % bsize], activation)
+// for the single element i owned by this work-item. A null `a` or `b`
+// contributes 0 to the sum. Work-items with i >= size do nothing.
+template <typename T>
+void addVectors_kernel(T* c, T* a, T* b, int size, int asize, int bsize,
+                       ActivationFunction activation,
+                       const sycl::nd_item<3>& item_ct1) {
+  const int idx = item_ct1.get_local_id(2) +
+                  item_ct1.get_local_range(2) * item_ct1.get_group(2);
+  if (idx >= size) return;
+
+  // Inputs are indexed modulo their own length so they may be shorter than c.
+  const float lhs = a ? (float)(a[idx % asize]) : 0.0f;
+  const float rhs = b ? (float)(b[idx % bsize]) : 0.0f;
+
+  const float activated = activate(lhs + rhs, activation);
+  c[idx] = (T)activated;
+}
+
+// Enqueues addVectors_kernel on `sycl_queue` to add two vectors and apply
+// `activation` to each sum. `a` and `b` may be shorter than the `size`-element
+// output `c` (they are indexed modulo asize / bsize) and either may be null.
+// One work-item is launched per output element, in work-groups of 256.
+template <typename T>
+void addVectors(T* c, T* a, T* b, int size, int asize, int bsize,
+                ActivationFunction activation, sycl::queue& sycl_queue) {
+  const int kWorkGroupSize = 256;
+  const int num_groups = DivUp(size, kWorkGroupSize);
+
+  const sycl::range<3> local_range(1, 1, kWorkGroupSize);
+  const sycl::range<3> global_range =
+      sycl::range<3>(1, 1, num_groups) * local_range;
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(global_range, local_range),
+      [=](sycl::nd_item<3> item_ct1) {
+        addVectors_kernel(c, a, b, size, asize, bsize, activation, item_ct1);
+      });
+}
+
+// Kernel body for addVectorsHNC_NHC: in-place a += b where the flat index
+// into `a` is decomposed as (h, n, c) and `b` is read at (n, h, c).
+// Each work-item handles one element; work-items past N * H * C do nothing.
+template <typename T>
+void addVectorsHNC_NHC_kernel(T* a, T* b, int N, int H, int C,
+                              const sycl::nd_item<3>& item_ct1) {
+  const int flat = item_ct1.get_local_id(2) +
+                   item_ct1.get_local_range(2) * item_ct1.get_group(2);
+  if (flat >= N * H * C) return;
+
+  // Peel the coordinates off the flat index, innermost (c) first.
+  const int c = flat % C;
+  const int n = (flat / C) % N;
+  const int h = (flat / C) / N;
+
+  const float lhs = (float)a[flat];
+  const float rhs = (float)b[n * H * C + h * C + c];
+  const float sum = lhs + rhs;
+
+  a[flat] = (T)sum;
+}
+
+// Enqueues addVectorsHNC_NHC_kernel on `sycl_queue`: adds `b` into `a` in
+// place, with one work-item per element of the N * H * C tensor and
+// work-groups of 256.
+template <typename T>
+void addVectorsHNC_NHC(T* a, T* b, int N, int H, int C,
+                       sycl::queue& sycl_queue) {
+  const int kWorkGroupSize = 256;
+  const int num_groups = DivUp(N * H * C, kWorkGroupSize);
+
+  const sycl::range<3> local_range(1, 1, kWorkGroupSize);
+  const sycl::range<3> global_range =
+      sycl::range<3>(1, 1, num_groups) * local_range;
+
+  sycl_queue.parallel_for(sycl::nd_range<3>(global_range, local_range),
+                          [=](sycl::nd_item<3> item_ct1) {
+                            addVectorsHNC_NHC_kernel(a, b, N, H, C, item_ct1);
+                          });
+}
+
+// Kernel body for the contiguous addBiasBatched variant. Each work-item
+// handles four consecutive channels starting at c = local_id(2) * 4 of row n
+// in batch entry group(1):
+//   output[batch*N*C + n*C + c + k] =
+//       activate(input[same index] + bias[batch*C + c + k], act), k = 0..3
+// Rows n >= N are skipped. Data is moved through copyAs with an 8-byte
+// (fp16) or 16-byte (fp32) vector type, i.e. four elements at a time.
+template <typename T, ActivationFunction act>
+void addBiasBatched_kernel(T* output, const T* input, const T* bias, int N,
+                           int C, const sycl::nd_item<3>& item_ct1) {
+  const int batch = item_ct1.get_group(1);
+  const int n = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                item_ct1.get_local_id(1);
+  if (n >= N) return;
+  const int c = item_ct1.get_local_id(2) * 4;
+
+  const int bias_offset = batch * C + c;
+  const int tensor_offset = batch * N * C + n * C + c;
+
+  float acc[4];
+  float bias_vals[4];
+
+  // Load the four inputs and four biases, widening fp16 to float.
+  const bool is_half = std::is_same<sycl::half, T>::value;
+  if (is_half) {
+    sycl::half staged[4];
+    copyAs<sycl::uint2>(&staged[0], &input[tensor_offset]);
+#pragma unroll
+    for (int k = 0; k < 4; k++) acc[k] = (float)staged[k];
+
+    copyAs<sycl::uint2>(&staged[0], &bias[bias_offset]);
+#pragma unroll
+    for (int k = 0; k < 4; k++) bias_vals[k] = (float)staged[k];
+  } else {
+    copyAs<sycl::uint4>(&acc[0], &input[tensor_offset]);
+    copyAs<sycl::uint4>(&bias_vals[0], &bias[bias_offset]);
+  }
+
+  // Bias add followed by the compile-time selected activation.
+#pragma unroll
+  for (int k = 0; k < 4; k++) {
+    acc[k] = activate(acc[k] + bias_vals[k], act);
+  }
+
+  // Store the four results, narrowing back to fp16 when needed.
+  if (is_half) {
+    sycl::half narrowed[4];
+#pragma unroll
+    for (int k = 0; k < 4; k++) narrowed[k] = (sycl::half)acc[k];
+    copyAs<sycl::uint2>(&output[tensor_offset], &narrowed[0]);
+  } else {
+    copyAs<sycl::uint4>(&output[tensor_offset], &acc[0]);
+  }
+}
+
+// Adds a per-batch bias to a tensor and applies an activation:
+//   output[b][n][c] = act(input[b][n][c] + bias[b][c])
+// Input/output tensors are Batch * N * C. The bias tensor is Batch * C (the
+// kernel reads bias[batch * C + c]): a different bias row for each batch
+// entry, shared by all N rows of that entry.
+//
+// Each work-item processes 4 channels, so C must be a positive multiple of 4
+// and at most 2048; otherwise an Exception is thrown. An Exception is also
+// thrown for activations without a case in the switch below. An empty tensor
+// (Batch <= 0 or N <= 0) is a no-op.
+template <typename T>
+void addBiasBatched(T* output, const T* input, const T* bias, int Batch, int N,
+                    int C, ActivationFunction activation,
+                    sycl::queue& sycl_queue) {
+  // process 4 elements per thread to achieve close to peak memory bandwidth
+  // C == 0 must be rejected too: it would make blockDim[2] zero and the
+  // division below undefined.
+  if (C <= 0 || C % 4 != 0) throw Exception("unsupported filter size");
+  if (C > 2048) throw Exception("unsupported filter size");
+
+  // Nothing to do for an empty tensor; also avoids a zero-sized work-group
+  // dimension (blockDim[1] is clamped to N).
+  if (Batch <= 0 || N <= 0) return;
+
+  sycl::range<3> blockDim(1, 1, 1), gridDim(1, 1, 1);
+  blockDim[2] = C / 4;
+  // Aim for roughly 512 work-items per group, but never more rows than N.
+  unsigned int tmp = (512 / blockDim[2]);
+  blockDim[1] = sycl::min(sycl::max(tmp, 1u), (unsigned int)N);
+  blockDim[0] = 1;
+  gridDim[2] = DivUp(N, blockDim[1]);
+  gridDim[1] = Batch;
+  gridDim[0] = 1;
+
+  const sycl::nd_range<3> launch_range(gridDim * blockDim, blockDim);
+
+  // The activation is a template parameter of the kernel, so each supported
+  // value needs its own instantiation.
+  switch (activation) {
+    case ACTIVATION_NONE:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_NONE>(output, input, bias, N, C,
+                                                  item_ct1);
+      });
+      break;
+    case ACTIVATION_SELU:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_SELU>(output, input, bias, N, C,
+                                                  item_ct1);
+      });
+      break;
+    case ACTIVATION_MISH:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_MISH>(output, input, bias, N, C,
+                                                  item_ct1);
+      });
+      break;
+    case ACTIVATION_RELU:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_RELU>(output, input, bias, N, C,
+                                                  item_ct1);
+      });
+      break;
+    case ACTIVATION_SWISH:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_SWISH>(output, input, bias, N, C,
+                                                   item_ct1);
+      });
+      break;
+    case ACTIVATION_RELU_2:  // square relu
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_RELU_2>(output, input, bias, N, C,
+                                                    item_ct1);
+      });
+      break;
+    default:
+      throw Exception(
+          "unsupported activation in addBiasBatched. Add in switch-case here");
+  }
+}
+
+// Kernel body for the strided addBiasBatched variant. Identical to the
+// contiguous kernel except that consecutive batch entries of the input/output
+// tensors are Nstride * C elements apart:
+//   output[batch*Nstride*C + n*C + c + k] =
+//       activate(input[same index] + bias[batch*C + c + k], act), k = 0..3
+// Each work-item handles four consecutive channels; rows n >= N are skipped.
+template <typename T, ActivationFunction act>
+void addBiasBatched_kernel(T* output, const T* input, const T* bias, int N,
+                           int C, int Nstride,
+                           const sycl::nd_item<3>& item_ct1) {
+  const int batch = item_ct1.get_group(1);
+  const int n = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                item_ct1.get_local_id(1);
+  if (n >= N) return;
+  const int c = item_ct1.get_local_id(2) * 4;
+
+  const int bias_offset = batch * C + c;
+  const int tensor_offset = batch * Nstride * C + n * C + c;
+
+  float acc[4];
+  float bias_vals[4];
+
+  // Load the four inputs and four biases, widening fp16 to float.
+  const bool is_half = std::is_same<sycl::half, T>::value;
+  if (is_half) {
+    sycl::half staged[4];
+    copyAs<sycl::uint2>(&staged[0], &input[tensor_offset]);
+#pragma unroll
+    for (int k = 0; k < 4; k++) acc[k] = (float)staged[k];
+
+    copyAs<sycl::uint2>(&staged[0], &bias[bias_offset]);
+#pragma unroll
+    for (int k = 0; k < 4; k++) bias_vals[k] = (float)staged[k];
+  } else {
+    copyAs<sycl::uint4>(&acc[0], &input[tensor_offset]);
+    copyAs<sycl::uint4>(&bias_vals[0], &bias[bias_offset]);
+  }
+
+  // Bias add followed by the compile-time selected activation.
+#pragma unroll
+  for (int k = 0; k < 4; k++) {
+    acc[k] = activate(acc[k] + bias_vals[k], act);
+  }
+
+  // Store the four results, narrowing back to fp16 when needed.
+  if (is_half) {
+    sycl::half narrowed[4];
+#pragma unroll
+    for (int k = 0; k < 4; k++) narrowed[k] = (sycl::half)acc[k];
+    copyAs<sycl::uint2>(&output[tensor_offset], &narrowed[0]);
+  } else {
+    copyAs<sycl::uint4>(&output[tensor_offset], &acc[0]);
+  }
+}
+
+// Strided variant of addBiasBatched:
+//   output[b][n][c] = act(input[b][n][c] + bias[b][c])
+// Input/output tensors hold N rows of C channels per batch entry, with
+// consecutive batch entries Nstride * C elements apart. The bias tensor is
+// Batch * C (the kernel reads bias[batch * C + c]): a different bias row for
+// each batch entry, shared by all N rows of that entry.
+//
+// Each work-item processes 4 channels, so C must be a positive multiple of 4
+// and at most 4096; otherwise an Exception is thrown. An Exception is also
+// thrown for activations without a case in the switch below. An empty tensor
+// (Batch <= 0 or N <= 0) is a no-op.
+template <typename T>
+void addBiasBatched(T* output, const T* input, const T* bias, int Batch, int N,
+                    int C, int Nstride, ActivationFunction activation,
+                    sycl::queue& sycl_queue) {
+  // process 4 elements per thread to achieve close to peak memory bandwidth
+  // C == 0 must be rejected too: it would make blockDim[2] zero and the
+  // division below undefined.
+  if (C <= 0 || C % 4 != 0) throw Exception("unsupported filter size");
+  if (C > 4096) throw Exception("unsupported filter size");
+
+  // Nothing to do for an empty tensor; also avoids a zero-sized work-group
+  // dimension (blockDim[1] is clamped to N).
+  if (Batch <= 0 || N <= 0) return;
+
+  sycl::range<3> blockDim(1, 1, 1), gridDim(1, 1, 1);
+  blockDim[2] = C / 4;
+  // Aim for roughly 512 work-items per group, but never more rows than N.
+  unsigned int tmp = (512 / blockDim[2]);
+  blockDim[1] = sycl::min(sycl::max(tmp, 1u), (unsigned int)N);
+  blockDim[0] = 1;
+  gridDim[2] = DivUp(N, blockDim[1]);
+  gridDim[1] = Batch;
+  gridDim[0] = 1;
+
+  const sycl::nd_range<3> launch_range(gridDim * blockDim, blockDim);
+
+  // The activation is a template parameter of the kernel, so each supported
+  // value needs its own instantiation.
+  switch (activation) {
+    case ACTIVATION_NONE:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_NONE>(output, input, bias, N, C,
+                                                  Nstride, item_ct1);
+      });
+      break;
+    case ACTIVATION_SELU:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_SELU>(output, input, bias, N, C,
+                                                  Nstride, item_ct1);
+      });
+      break;
+    case ACTIVATION_MISH:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_MISH>(output, input, bias, N, C,
+                                                  Nstride, item_ct1);
+      });
+      break;
+    case ACTIVATION_RELU:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_RELU>(output, input, bias, N, C,
+                                                  Nstride, item_ct1);
+      });
+      break;
+    case ACTIVATION_SWISH:
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_SWISH>(output, input, bias, N, C,
+                                                   Nstride, item_ct1);
+      });
+      break;
+    case ACTIVATION_RELU_2:  // square relu
+      sycl_queue.parallel_for(launch_range, [=](sycl::nd_item<3> item_ct1) {
+        addBiasBatched_kernel<T, ACTIVATION_RELU_2>(output, input, bias, N, C,
+                                                    Nstride, item_ct1);
+      });
+      break;
+    default:
+      throw Exception(
+          "unsupported activation in addBiasBatched. Add in switch-case here");
+  }
+}
+
+// Kernel body for addBias_NCHW: for the element i owned by this work-item,
+//   c[i] = activate(a[i] + b[channel(i)], activation)
+// where channel(i) = (i / (H * W)) % C, i.e. the tensor is indexed as NCHW
+// and `b` holds one bias per channel. Work-items with i >= N*C*H*W do nothing.
+template <typename T>
+void addBias_NCHW_kernel(T* c, T* a, T* b, int N, int C, int H, int W,
+                         ActivationFunction activation,
+                         const sycl::nd_item<3>& item_ct1) {
+  const int idx = item_ct1.get_local_id(2) +
+                  item_ct1.get_local_range(2) * item_ct1.get_group(2);
+  const int total = N * C * H * W;
+  if (idx >= total) return;
+
+  // All this math can be optimized, but the kernel is memory bound anyway.
+  const int channel = (idx / (H * W)) % C;
+
+  const float value = (float)a[idx];
+  const float bias = (float)b[channel];
+  const float activated = activate(value + bias, activation);
+
+  c[idx] = (T)activated;
+}
+
+// Adds a per-channel bias `b` to an N x C x H x W tensor `a` (e.g. a
+// convolution's output), applies `activation`, and writes the result to `c`.
+// Enqueues addBias_NCHW_kernel on `sycl_queue` with one work-item per element
+// and work-groups of 256.
+template <typename T>
+void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W,
+                  ActivationFunction activation, sycl::queue& sycl_queue) {
+  const int kWorkGroupSize = 256;
+  const int total = N * C * H * W;
+  const int num_groups = DivUp(total, kWorkGroupSize);
+
+  const sycl::range<3> local_range(1, 1, kWorkGroupSize);
+  const sycl::range<3> global_range =
+      sycl::range<3>(1, 1, num_groups) * local_range;
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(global_range, local_range),
+      [=](sycl::nd_item<3> item_ct1) {
+        addBias_NCHW_kernel(c, a, b, N, C, H, W, activation, item_ct1);
+      });
+}
+
+// Reads element (n, c, h, w) from an Nin x Cin x H x W tensor stored in NCHW
+// order and converts it to dT. Returns 0 when n or c lies outside the source
+// tensor (n >= Nin or c >= Cin), which lets callers pad with zeros. h and w
+// are not range-checked.
+template <typename dT, typename sT>
+dT readNCHW(const sT* input_tensor, int n, int c, int h, int w, int Nin,
+            int Cin, int H, int W) {
+  const bool in_range = (n < Nin) && (c < Cin);
+  if (!in_range) return 0;
+
+  // Row-major NCHW offset, written in Horner form.
+  const int offset = ((n * Cin + c) * H + h) * W + w;
+  return (dT)(input_tensor[offset]);
+}
+
+// Kernel body for convertNCHWtoNHWC. Each work-item writes one element of the
+// Nout x H x W x Cout output: its flat output index is decomposed as
+// (n, h, w, c) with c innermost, and the value is fetched from the
+// Nin x Cin x H x W input via readNCHW, which yields 0 for n >= Nin or
+// c >= Cin. Work-items past Nout * Cout * H * W do nothing.
+template <typename dT, typename sT>
+void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor, int Nin,
+                       int Cin, int Nout, int Cout, int H, int W,
+                       const sycl::nd_item<3>& item_ct1) {
+  const int out_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                      item_ct1.get_local_id(2);
+  if (out_idx >= Nout * Cout * H * W) return;
+
+  // Peel the coordinates off the flat index, innermost (c) first.
+  int rest = out_idx;
+  const int c = rest % Cout;
+  rest /= Cout;
+  const int w = rest % W;
+  rest /= W;
+  const int h = rest % H;
+  const int n = rest / H;
+
+  output_tensor[out_idx] =
+      readNCHW<dT, sT>(input_tensor, n, c, h, w, Nin, Cin, H, W);
+}
+
+// Converts an Nin x Cin x H x W (NCHW) tensor of SrcType into an
+// Nout x H x W x Cout (NHWC) tensor of DstType. Output positions whose n or c
+// fall outside the input are written as 0 (see readNCHW). Enqueues
+// NCHWtoNHWC_kernel on `sycl_queue` with one work-item per output element and
+// work-groups of 256.
+template <typename DstType, typename SrcType>
+void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor,
+                       int Nin, int Cin, int Nout, int Cout, int H, int W,
+                       sycl::queue& sycl_queue) {
+  const int kWorkGroupSize = 256;
+  size_t out_elements = Nout * Cout * H * W;
+  const int num_groups = DivUp(out_elements, kWorkGroupSize);
+
+  const sycl::range<3> local_range(1, 1, kWorkGroupSize);
+  const sycl::range<3> global_range =
+      sycl::range<3>(1, 1, num_groups) * local_range;
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(global_range, local_range),
+      [=](sycl::nd_item<3> item_ct1) {
+        NCHWtoNHWC_kernel(output_tensor, input_tensor, Nin, Cin, Nout, Cout, H,
+                          W, item_ct1);
+      });
+}
+
+// Kernel body for copyTypeConverted: op[i] = (DstType)ip[i] for the element i
+// owned by this work-item. Work-items with i >= N do nothing.
+template <typename DstType, typename SrcType>
+void copyTypeConverted_kernel(DstType* op, SrcType* ip, int N,
+                              const sycl::nd_item<3>& item_ct1) {
+  const int idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                  item_ct1.get_local_id(2);
+  if (idx < N) {
+    const DstType converted = (DstType)ip[idx];
+    op[idx] = converted;
+  }
+}
+
+// Copies N elements from `ip` to `op`, converting each from SrcType to
+// DstType. Enqueues copyTypeConverted_kernel on `sycl_queue` with one
+// work-item per element and work-groups of 256.
+template <typename DstType, typename SrcType>
+void copyTypeConverted(DstType* op, SrcType* ip, int N,
+                       sycl::queue& sycl_queue) {
+  const int kWorkGroupSize = 256;
+  const int num_groups = DivUp(N, kWorkGroupSize);
+
+  const sycl::range<3> local_range(1, 1, kWorkGroupSize);
+  const sycl::range<3> global_range =
+      sycl::range<3>(1, 1, num_groups) * local_range;
+
+  sycl_queue.parallel_for(sycl::nd_range<3>(global_range, local_range),
+                          [=](sycl::nd_item<3> item_ct1) {
+                            copyTypeConverted_kernel(op, ip, N, item_ct1);
+                          });
+}
+
+// Kernel body for batchNorm. For the element owned by this work-item:
+//   output[i] = activate((input[i] - means[ch]) * varMultipliers[ch]
+//                        + (skipInput ? skipInput[i] : 0), activation)
+// The channel index ch is derived from the flat index assuming NCHW layout
+// when T is float-sized and NHWC layout otherwise (fp16).
+// Work-items with i >= N * C * H * W do nothing; the launcher rounds the
+// range up to a whole number of work-groups, so the tail must be masked off
+// here to avoid out-of-bounds reads and writes.
+template <typename T>
+void batchNorm_kernel(T* output, const T* input, const T* skipInput, int N,
+                      int C, int H, int W, const float* means,
+                      const float* varMultipliers,
+                      ActivationFunction activation,
+                      const sycl::nd_item<3>& item_ct1) {
+  int index = item_ct1.get_local_id(2) +
+              item_ct1.get_local_range(2) * item_ct1.get_group(2);
+
+  // Padding work-items in the last work-group have no element to process.
+  if (index >= N * C * H * W) return;
+
+  int wIndex = 0;
+  if (sizeof(T) == sizeof(float))
+    wIndex = (index / (H * W)) % C;  // NCHW for fp32.
+  else
+    wIndex = index % C;  // NHWC for fp16.
+
+  float el = input[index];
+  float mean = means[wIndex];
+  float varMulti = varMultipliers[wIndex];
+
+  el -= mean;
+  el *= varMulti;
+
+  // Optional residual/skip connection, added before the activation.
+  if (skipInput) el += (float)skipInput[index];
+
+  el = activate(el, activation);
+
+  output[index] = (T)el;
+}
+
+// Applies batch normalization, an optional skip-connection add (when
+// `skipInput` is non-null) and `activation` to an N*C*H*W-element tensor.
+// Enqueues batchNorm_kernel on `sycl_queue`; every work-item processes a
+// single element, in work-groups of 256.
+template <typename T>
+void batchNorm(T* output, const T* input, const T* skipInput, int N, int C,
+               int H, int W, float* means, float* var_multipliers,
+               ActivationFunction activation, sycl::queue& sycl_queue) {
+  const int kWorkGroupSize = 256;
+  const int element_count = N * C * H * W;
+  const int num_groups = DivUp(element_count, kWorkGroupSize);
+
+  const sycl::range<3> local_range(1, 1, kWorkGroupSize);
+  const sycl::range<3> global_range =
+      sycl::range<3>(1, 1, num_groups) * local_range;
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(global_range, local_range),
+      [=](sycl::nd_item<3> item_ct1) {
+        batchNorm_kernel(output, input, skipInput, N, C, H, W, means,
+                         var_multipliers, activation, item_ct1);
+      });
+}
+
+// Kernel body for expandPlanes_Fp32_NCHW. Expands `n` (mask, value) pairs
+// into n planes of 64 floats: output[p * 64 + sq] is values[p] when bit `sq`
+// of masks[p] is set and 0 otherwise. Each work-item writes one output
+// element.
+//
+// The launcher uses work-groups of 256, so one group covers 4 planes; their
+// masks/values are staged in the work-group local arrays `shMasks`/`shVals`
+// (4 entries each) by the first 4 work-items of the group.
+//
+// Every work-item of the group reaches the barrier: out-of-range work-items
+// only return after it, because a work-group barrier must not be skipped by
+// part of the group. The staging loads are bounds-checked so a partial last
+// group never reads masks/values past index n - 1.
+void expandPlanes_kernel_Fp32_NCHW(float* output, const uint64_t* masks,
+                                   const float* values, int n,
+                                   const sycl::nd_item<3>& item_ct1,
+                                   uint64_t* shMasks, float* shVals) {
+  // Block size of 256, same mask/val for 64 consecutive threads.
+  constexpr int kNumShmemElements = 256 / 64;
+
+  const int localId = item_ct1.get_local_id(2);
+  const int index =
+      localId + item_ct1.get_local_range(2) * item_ct1.get_group(2);
+
+  const int planeIndex = index >> 6;
+
+  // Load inputs to shared memory. For the loading work-items (localId < 4)
+  // planeIndex is the first plane of this group, so planeIndex + localId is
+  // the plane staged in slot localId.
+  if (localId < kNumShmemElements && planeIndex + localId < n) {
+    shMasks[localId] = masks[planeIndex + localId];
+    shVals[localId] = values[planeIndex + localId];
+  }
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // Padding work-items past the last plane have nothing to write.
+  if (planeIndex >= n) return;
+
+  const int slot = localId >> 6;
+  uint64_t mask = shMasks[slot];
+
+  int sqIndex = index & 0x3F;
+  float op = 0;
+
+  bool set = !!(mask & (1ull << sqIndex));
+  if (set) {
+    op = shVals[slot];
+  }
+  output[index] = op;
+}
+
+// Expands `n` (mask, value) pairs into n planes of 8x8 floats in `output`.
+// Submits expandPlanes_kernel_Fp32_NCHW to `sycl_queue` with one work-item
+// per output element and work-groups of 256, and provides the kernel with
+// two work-group local scratch arrays (masks and values) of 4 entries each.
+void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks,
+                            const float* values, int n,
+                            sycl::queue& sycl_queue) {
+  const int kWorkGroupSize = 256;
+  // One staged (mask, value) pair per 64 work-items; must match the kernel's
+  // own kNumShmemElements (256 / 64).
+  const int kNumShmemElements = kWorkGroupSize / 64;
+
+  const int work_items = n * 8 * 8;  // Each thread writes a single element.
+  const int num_groups = DivUp(work_items, kWorkGroupSize);
+
+  sycl_queue.submit([&](sycl::handler& cgh) {
+    sycl::local_accessor<uint64_t, 1> shMasks_acc_ct1(
+        sycl::range<1>(kNumShmemElements), cgh);
+    sycl::local_accessor<float, 1> shVals_acc_ct1(
+        sycl::range<1>(kNumShmemElements), cgh);
+
+    const sycl::range<3> local_range(1, 1, kWorkGroupSize);
+    const sycl::range<3> global_range =
+        sycl::range<3>(1, 1, num_groups) * local_range;
+
+    cgh.parallel_for(
+        sycl::nd_range<3>(global_range, local_range),
+        [=](sycl::nd_item<3> item_ct1) {
+          expandPlanes_kernel_Fp32_NCHW(output, masks, values, n, item_ct1,
+                                        shMasks_acc_ct1.get_pointer(),
+                                        shVals_acc_ct1.get_pointer());
+        });
+  });
+}
+
+// Expands bit-mask input planes into an fp16 tensor, one element per
+// work-item.
+//
+// Work-item `index` handles plane (index % kInputPlanes) of board
+// (index / (kInputPlanes * 64)) at square ((index / kInputPlanes) & 63), so
+// the plane index is the fastest-varying one in `output`. The element gets the
+// plane's entry from `values` when the square's bit is set in the plane's
+// mask, and 0 otherwise. Work-items with index >= n * 8 * 8 do nothing.
+//
+// TODO: Can optimize using shared memory if this becomes a bottleneck.
+void expandPlanes_kernel_Fp16_NHWC(sycl::half* output, const uint64_t* masks,
+                                   const float* values, int n,
+                                   const sycl::nd_item<3>& item_ct1) {
+  const int index = item_ct1.get_local_id(2) +
+                    item_ct1.get_local_range(2) * item_ct1.get_group(2);
+  if (index >= n * 8 * 8) return;
+
+  const int square = (index / kInputPlanes) & 0x3F;
+  const int board = index / (kInputPlanes * 8 * 8);
+  const int plane = index % kInputPlanes;
+  const int slot = board * kInputPlanes + plane;
+
+  const bool bitSet = ((masks[slot] >> square) & 1ull) != 0;
+
+  // `values` is only read when the bit is set, exactly like the mask test
+  // guarding the load in the straightforward if-form.
+  output[index] = bitSet ? static_cast<sycl::half>(values[slot])
+                         : static_cast<sycl::half>(0.0f);
+}
+
+// Host-side launcher for expandPlanes_kernel_Fp16_NHWC.
+//
+// Enqueues DivUp(n * 8 * 8, 256) work-groups of 256 work-items on
+// `sycl_queue`; the kernel itself ignores work-items past n * 8 * 8. The call
+// only enqueues the kernel and does not wait on the returned event.
+void expandPlanes_Fp16_NHWC(sycl::half* output, const uint64_t* masks,
+                            const float* values, int n, sycl::queue &sycl_queue) {
+  const int kBlockSize = 256;
+  const int totalThreads = n * 8 * 8;  // One work-item per output element.
+  const int numBlocks = DivUp(totalThreads, kBlockSize);
+
+  const sycl::range<3> localRange(1, 1, kBlockSize);
+  const sycl::range<3> groupRange(1, 1, numBlocks);
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(groupRange * localRange, localRange),
+      [=](sycl::nd_item<3> item_ct1) {
+        expandPlanes_kernel_Fp16_NHWC(output, masks, values, n, item_ct1);
+      });
+}
+
+// Expands bit-mask input planes into an fp16 tensor in plane-major order.
+//
+// Work-item `index` writes output[index], where plane = index / 64 and
+// square = index % 64. The element is the plane's value (converted to fp16)
+// when the square's bit is set in the plane's mask, and 0 otherwise.
+//
+//   output   - destination, at least n * 64 elements.
+//   masks    - one 64-bit occupancy mask per plane (n entries).
+//   values   - one value per plane (n entries).
+//   n        - number of planes.
+//   item_ct1 - nd_item of the launch; only dimension 2 is used.
+//   shMasks, shVals - work-group local staging arrays with at least
+//              kNumShmemElements entries each.
+void expandPlanes_kernel_Fp16_NCHW(sycl::half* output, const uint64_t* masks,
+                                   const float* values, int n,
+                                   const sycl::nd_item<3>& item_ct1,
+                                   uint64_t* shMasks, sycl::half* shVals) {
+  // Block size of 256, same mask/val for 64 consecutive threads.
+  constexpr int kNumShmemElements = 256 / 64;
+
+  const int localId = item_ct1.get_local_id(2);
+  const int index =
+      localId + item_ct1.get_local_range(2) * item_ct1.get_group(2);
+  const int planeIndex = index >> 6;
+
+  // Stage the masks/values of this work-group's planes in local memory.
+  // With the 256-wide work-groups used by the launcher, the first
+  // kNumShmemElements work-items all see the group's first plane as
+  // planeIndex, so planeIndex + localId is the plane belonging to slot
+  // localId. Slots that would fall past the last plane are left untouched
+  // instead of reading beyond the end of `masks` / `values`.
+  if (localId < kNumShmemElements && planeIndex + localId < n) {
+    shMasks[localId] = masks[planeIndex + localId];
+    shVals[localId] = values[planeIndex + localId];
+  }
+
+  // The barrier has to be reached by every work-item of the group, so the
+  // out-of-range exit is taken only after it.
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  if (planeIndex >= n) return;
+
+  const int slot = localId >> 6;  // Staging slot shared by 64 work-items.
+  const int sqIndex = index & 0x3F;
+
+  const bool set = !!(shMasks[slot] & (1ull << sqIndex));
+  output[index] = set ? shVals[slot] : static_cast<sycl::half>(0.0f);
+}
+
+// Host-side launcher for expandPlanes_kernel_Fp16_NCHW.
+//
+// Enqueues one work-item per output element (n * 8 * 8 in total) on
+// `sycl_queue` in work-groups of 256 and gives each group the local-memory
+// mask/value staging arrays the kernel expects. Returns without waiting for
+// the kernel to finish.
+void expandPlanes_Fp16_NCHW(sycl::half* output, const uint64_t* masks,
+                            const float* values, int n, sycl::queue &sycl_queue) {
+  const int blockSize = 256;
+  // One staging slot per 64 work-items; must match kNumShmemElements
+  // (256 / 64) inside the kernel.
+  const int slotsPerBlock = blockSize / 64;
+  const int totalThreads = n * 8 * 8;  // One work-item per output element.
+  const int numBlocks = DivUp(totalThreads, blockSize);
+
+  const sycl::range<3> localRange(1, 1, blockSize);
+  const sycl::range<3> groupRange(1, 1, numBlocks);
+
+  sycl_queue.submit([&](sycl::handler& cgh) {
+    sycl::local_accessor<uint64_t, 1> maskSlots(
+        sycl::range<1>(slotsPerBlock), cgh);
+    sycl::local_accessor<sycl::half, 1> valueSlots(
+        sycl::range<1>(slotsPerBlock), cgh);
+
+    cgh.parallel_for(
+        sycl::nd_range<3>(groupRange * localRange, localRange),
+        [=](sycl::nd_item<3> item_ct1) {
+          expandPlanes_kernel_Fp16_NCHW(output, masks, values, n, item_ct1,
+                                        maskSlots.get_pointer(),
+                                        valueSlots.get_pointer());
+        });
+  });
+}
+
+// Element-wise scale/bias/skip-add followed by an activation, for tensors
+// laid out as [n][c][64].
+//
+// For every element `tid` in [0, inputSize):
+//   output[tid] = activate((input[tid] + prevLayerBias[c]) * sigmoid(scale)
+//                          + output[tid] + bias)
+// where c = (tid / 64) % C, n = (tid / 64) / C, scale = scaleBias[n*2*C + c]
+// and bias = scaleBias[n*2*C + C + c]. `prevLayerBias` may be null, in which
+// case no bias is added to the input. `output` is read (skip connection) and
+// then overwritten in place.
+template <typename T>
+void globalScale_kernel(T* output, const T* input,
+                                   const T* scaleBias, const T* prevLayerBias,
+                                   int inputSize, int C,
+                                   ActivationFunction activation,
+                                   const sycl::nd_item<3> &item_ct1) {
+  const int kPlaneSize = 64;
+
+  int tid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+            item_ct1.get_local_id(2);
+
+  // Valid elements are [0, inputSize). The launch rounds the work-item count
+  // up to a whole work-group, so tid == inputSize must be rejected as well,
+  // otherwise it reads and writes one element past the tensors.
+  if (tid >= inputSize) return;
+
+  int nc = tid / kPlaneSize;
+  int n = nc / C;
+  int c = nc % C;
+
+  float val1 = input[tid];   // Output of residual block to be scaled.
+  float val2 = output[tid];  // Skip connection to be added directly.
+
+  if (prevLayerBias) {
+    val1 += (float)(prevLayerBias[c]);
+  }
+
+  // Per sample, scaleBias holds C scales followed by C biases.
+  int startIdx = n * 2 * C;
+
+  float s = scaleBias[startIdx + c];
+  s = 1.0f / (1.0f + sycl::exp(-s));  // Sigmoid on scale.
+
+  float b = scaleBias[startIdx + c + C];
+
+  float op = val1 * s + val2 + b;
+  op = activate(op, activation);
+  output[tid] = (T)op;
+}
+
+// Element-wise scale/bias/skip-add followed by an activation, for fp16
+// tensors in which the channel is the fastest-varying index.
+//
+// For every element `tid` in [0, inputSize):
+//   output[tid] = activate((input[tid] + prevLayerBias[c]) * sigmoid(scale)
+//                          + output[tid] + bias)
+// where c = tid % C, n = tid / HWC, scale = scaleBias[n*2*C + c] and
+// bias = scaleBias[n*2*C + C + c]. `prevLayerBias` may be null, in which case
+// no bias is added to the input. `output` is read (skip connection) and then
+// overwritten in place. The arithmetic is carried out in float.
+void globalScale_kernel_fp16_nhwc(sycl::half* output, const sycl::half* input,
+                                  const sycl::half* scaleBias,
+                                  const sycl::half* prevLayerBias,
+                                  int inputSize, int C, int HWC,
+                                  ActivationFunction activation,
+                                  const sycl::nd_item<3>& item_ct1) {
+  int tid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+            item_ct1.get_local_id(2);
+
+  // Valid elements are [0, inputSize). The launch rounds the work-item count
+  // up to a whole work-group, so tid == inputSize must be rejected as well,
+  // otherwise it reads and writes one element past the tensors.
+  if (tid >= inputSize) return;
+
+  int c = tid % C;
+  int n = tid / (HWC);
+
+  float val1 = (float)input[tid];   // Output of residual block to be scaled.
+  float val2 = (float)output[tid];  // Skip connection to be added directly.
+  if (prevLayerBias) {
+    val1 += (float)prevLayerBias[c];
+  }
+
+  // Per sample, scaleBias holds C scales followed by C biases.
+  int startIdx = n * 2 * C;
+
+  float s = scaleBias[startIdx + c];
+  s = 1.0f / (1.0f + sycl::exp(-s));  // Sigmoid on scale.
+
+  float b = scaleBias[startIdx + c + C];
+
+  float op = val1 * s + val2 + b;
+  op = activate(op, activation);
+
+  output[tid] = (sycl::half)op;
+}
+
+// Averages the 64 board squares of every channel for fp16 data whose channel
+// index varies fastest.
+//
+// Launch shape expected by the indexing below:
+//   - one work-group per sample (N groups),
+//   - one work-item per channel (C work-items per group),
+//   - each work-group consumes 64 * C consecutive inputs,
+//   - each work-item writes a single output.
+// `prevLayerBias`, when non-null, is indexed by the work-item's local id and
+// added to the average. The accumulation is done in float.
+void globalAvgPool_kernel_NHWC_fp16(sycl::half* output, const sycl::half* input,
+                                    const sycl::half* prevLayerBias,
+                                    int inputSize, int outputSize,
+                                    const sycl::nd_item<3>& item_ct1) {
+  const int kSquares = 64;  // 8x8 board.
+
+  const int channel = item_ct1.get_local_id(2);
+  const int channels = item_ct1.get_local_range(2);
+  const int firstOutput = item_ct1.get_group(2) * channels;
+  const int firstInput = firstOutput * kSquares;
+
+  // Walk the squares of this channel; consecutive squares are `channels`
+  // elements apart.
+  float total = 0;
+#pragma unroll
+  for (int sq = 0; sq < kSquares; sq++) {
+    const int inputIndex = firstInput + sq * channels + channel;
+    if (inputIndex < inputSize) total += (float)(input[inputIndex]);
+  }
+
+  float mean = total / kSquares;
+
+  // Add bias from previous layer.
+  if (prevLayerBias) mean += (float)(prevLayerBias[channel]);
+
+  const int opIndex = firstOutput + channel;
+  if (opIndex < outputSize) output[opIndex] = (sycl::half)mean;
+}
+
+// Averages each 64-element plane of `input` into one element of `output`.
+//
+// Every group of 32 consecutive work-items handles one plane: each work-item
+// reads 2 inputs (8x8/32), the partial sums are combined with sub-group
+// shifts, and lane 0 writes the average (plus prevLayerBias[plane % C] when
+// `prevLayerBias` is non-null).
+// NOTE(review): the lane arithmetic (& 0x1F, >> 5) assumes the sub-group is
+// 32 work-items wide - confirm against SYCL_SUB_GROUP_SIZE used at launch.
+template <typename T>
+void globalAvgPool_kernel(T* output, const T* input,
+                                     const T* prevLayerBias, int inputSize,
+                                     int outputSize, int C,
+                                     const sycl::nd_item<3> &item_ct1) {
+  const int kLanes = 32;
+  const int kPlaneElements = 64;
+
+  const int tid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                  item_ct1.get_local_id(2);
+  const int lane = item_ct1.get_local_id(2) & 0x1F;
+  const int planeIndex = tid >> 5;
+
+  // First input element of the plane owned by this group of 32 lanes.
+  const int planeStart = (tid - lane) * (kPlaneElements / kLanes);
+
+  // Per-lane partial sum: elements lane and lane + 32 of the plane.
+  float partial = 0;
+#pragma unroll
+  for (int i = 0; i < kPlaneElements; i += kLanes) {
+    const int idx = planeStart + lane + i;
+    if (idx < inputSize) partial += (float)(input[idx]);
+  }
+
+  // Tree reduction across the sub-group; afterwards lane 0 holds the sum of
+  // the whole plane. The sub-group shift has no mask argument, so all lanes
+  // take part.
+#pragma unroll
+  for (int offset = 1; offset < kLanes; offset *= 2) {
+    partial +=
+        sycl::shift_group_left(item_ct1.get_sub_group(), partial, offset);
+  }
+
+  // Only the first lane of an in-range plane produces output.
+  if (lane != 0 || planeIndex >= outputSize) return;
+
+  float avg = partial / kPlaneElements;
+  if (prevLayerBias) avg += (float)prevLayerBias[planeIndex % C];
+  output[planeIndex] = (T)avg;
+}
+
+// Host-side launcher for global average pooling over the 8x8 board.
+//
+//   N, C          - batch size and channel count.
+//   output        - N * C averages.
+//   input         - N * C * 64 inputs.
+//   prevLayerBias - optional per-channel bias (may be null).
+//   nhwc          - selects the fp16 channel-fastest kernel; asserted to be
+//                   used with T == sycl::half only.
+//   sycl_queue    - queue the kernel is enqueued on (no wait is performed).
+template <typename T>
+void globalAvgPool(int N, int C, T* output, const T* input,
+                   const T* prevLayerBias, bool nhwc, sycl::queue &sycl_queue) {
+  const int kPlaneSize = 64;
+  const int inputSize = N * C * kPlaneSize;
+  const int outputSize = N * C;
+
+  if (nhwc) {
+    const bool fp16 = std::is_same<sycl::half, T>::value;
+    assert(fp16);
+
+    // N work-groups, each with C work-items. C is used directly as the
+    // work-group size, so it has to stay within the device's
+    // max_work_group_size.
+    const sycl::range<3> localRange(1, 1, C);
+    const sycl::range<3> groupRange(1, 1, N);
+    sycl_queue.parallel_for(
+        sycl::nd_range<3>(groupRange * localRange, localRange),
+        [=](sycl::nd_item<3> item_ct1) {
+          globalAvgPool_kernel_NHWC_fp16((sycl::half*)output,
+                                         (sycl::half*)input,
+                                         (sycl::half*)prevLayerBias,
+                                         inputSize, outputSize, item_ct1);
+        });
+    return;
+  }
+
+  // NCHW layout (used with fp32): every group of 32 work-items processes a
+  // full plane (64 elements) and writes a single average, so N * C such
+  // groups are needed, packed 8 to a work-group.
+  const int kWarpsPerBlock = 8;
+  const int kBlockSize = kWarpsPerBlock * 32;
+  const int numBlocks = DivUp(outputSize, kWarpsPerBlock);
+
+  const sycl::range<3> localRange(1, 1, kBlockSize);
+  const sycl::range<3> groupRange(1, 1, numBlocks);
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(groupRange * localRange, localRange),
+      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
+        globalAvgPool_kernel(output, input, prevLayerBias, inputSize,
+                             outputSize, C, item_ct1);
+      });
+}
+
+// Host-side launcher for the global scale kernels.
+//
+// Enqueues one work-item per tensor element (N * C * 8 * 8) in work-groups of
+// 256 on `sycl_queue`. With `nhwc` set the fp16 channel-fastest kernel is used
+// (asserted to be T == sycl::half); otherwise the templated kernel runs on
+// T directly. `output` is both the skip input and the destination.
+template <typename T>
+void globalScale(int N, int C, T* output, const T* input, const T* scaleBias,
+                 const T* prevLayerBias, bool nhwc,
+                 ActivationFunction activation, sycl::queue &sycl_queue) {
+  // Each thread writes one output.
+  const int kBlockSize = 256;
+  const int elementCount = N * C * 8 * 8;
+  const int numBlocks = DivUp(elementCount, kBlockSize);
+
+  const sycl::range<3> localRange(1, 1, kBlockSize);
+  const sycl::nd_range<3> launchRange(
+      sycl::range<3>(1, 1, numBlocks) * localRange, localRange);
+
+  if (!nhwc) {
+    sycl_queue.parallel_for(launchRange, [=](sycl::nd_item<3> item_ct1) {
+      globalScale_kernel(output, input, scaleBias, prevLayerBias,
+                         elementCount, C, activation, item_ct1);
+    });
+    return;
+  }
+
+  const bool fp16 = std::is_same<sycl::half, T>::value;
+  assert(fp16);
+
+  const int hwc = 8 * 8 * C;  // Elements per sample.
+  sycl_queue.parallel_for(launchRange, [=](sycl::nd_item<3> item_ct1) {
+    globalScale_kernel_fp16_nhwc(
+        (sycl::half*)output, (sycl::half*)input, (sycl::half*)scaleBias,
+        (sycl::half*)prevLayerBias, elementCount, C, hwc, activation,
+        item_ct1);
+  });
+}
+
+// Scatters policy inputs to their mapped output slots.
+//
+// Work-item `tid` handles input element (tid % usedSize) of sample
+// (tid / usedSize). `indices` gives the destination slot for each of the
+// first `usedSize` input elements; a negative entry means the element is
+// dropped. Rows of `input` are `inputSize` apart, rows of `output` are
+// `outputSize` apart. Work-items belonging to samples >= N do nothing.
+template <typename T>
+void policyMap_kernel(T* output, const T* input,
+                                 const short* indices, int N, int inputSize,
+                                 int usedSize, int outputSize,
+                                 const sycl::nd_item<3> &item_ct1) {
+  const int tid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                  item_ct1.get_local_id(2);
+
+  const int sample = tid / usedSize;
+  if (sample >= N) return;
+
+  const int src = tid % usedSize;
+  const int dst = indices[src];
+  if (dst < 0) return;  // No destination for this input element.
+
+  output[sample * outputSize + dst] = input[sample * inputSize + src];
+}
+
+// Host-side launcher for policyMap_kernel.
+//
+// Enqueues N * usedSize work-items (rounded up to work-groups of 256) on
+// `sycl_queue`: each work-item looks at one input element and only those with
+// a valid (non-negative) mapping in `indices` write to `output`.
+template <typename T>
+void PolicyMap(int N, T* output, const T* input, const short* indices,
+               int inputSize, int usedSize, int outputSize, sycl::queue &sycl_queue) {
+  const int kBlockSize = 256;
+  const int numBlocks = DivUp(N * usedSize, kBlockSize);
+
+  const sycl::range<3> localRange(1, 1, kBlockSize);
+  const sycl::range<3> groupRange(1, 1, numBlocks);
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(groupRange * localRange, localRange),
+      [=](sycl::nd_item<3> item_ct1) {
+        policyMap_kernel<T>(output, input, indices, N, inputSize, usedSize,
+                            outputSize, item_ct1);
+      });
+}
+
+// Host-side launcher for the fused output-transform / activation /
+// input-transform kernels.
+//
+//   use_se == false: launches OutputTransform_relu_InputTransform_kernel on a
+//     grid of (N, DivUp(C, kOpInpTransformBlockSize)) work-groups with
+//     kOpInpTransformBlockSize work-items each.
+//   use_se == true:  launches OutputTransform_SE_relu_InputTransform_kernel
+//     with N work-groups of C work-items; throws Exception when
+//     C > kMaxResBlockFusingChannels.
+//
+// NOTE(review): both kernels are instantiated for float and `skip` is cast to
+// float*, so this launcher presumably only works for T == float - confirm.
+template <typename T = float, bool use_se, ActivationFunction activation,
+          bool use_bias, bool use_skip>
+void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
+                          const T* skip, const T* bias, const T* w1,
+                          const T* b1, const T* w2, const T* b2, sycl::queue &sycl_queue) {
+  if (!use_se) {
+    // Each thread processes entire chess board.
+    const sycl::range<3> localRange(1, 1, kOpInpTransformBlockSize);
+    const sycl::range<3> groupRange(1, N,
+                                    DivUp(C, kOpInpTransformBlockSize));
+    sycl_queue.parallel_for(
+        sycl::nd_range<3>(groupRange * localRange, localRange),
+        [=](sycl::nd_item<3> item_ct1) {
+          OutputTransform_relu_InputTransform_kernel<float, activation,
+                                                     use_bias, use_skip>(
+              N, C, output, input, (float*)skip, bias, item_ct1);
+        });
+    return;
+  }
+
+  if (C > kMaxResBlockFusingChannels) {
+    throw Exception(
+        "res block fusing opt not supported for the given data type and no "
+        "of filters\n");
+  }
+
+  // C is used directly as the work-group size, so it also has to stay within
+  // the device's max_work_group_size.
+  const sycl::range<3> localRange(1, 1, C);
+  const sycl::range<3> groupRange(1, 1, N);
+
+  sycl_queue.submit([&](sycl::handler& cgh) {
+    // Local-memory scratch for the kernel. The literal sizes are the values
+    // the migration tool substituted for the expressions named next to them.
+    sycl::local_accessor<float, 1> shared_data_acc_ct1(
+        sycl::range<1>(384 /*kMaxResBlockFusingChannels*/), cgh);
+    sycl::local_accessor<float, 2> shared_sums_acc_ct1(
+        sycl::range<2>(12 /*kMaxResBlockFusingChannels / 32*/,
+                       128 /*kMaxResBlockFusingSeK*/),
+        cgh);
+
+    cgh.parallel_for(
+        sycl::nd_range<3>(groupRange * localRange, localRange),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
+          OutputTransform_SE_relu_InputTransform_kernel<float, activation,
+                                                        use_bias, use_skip>(
+              N, C, se_K, output, input, (float*)skip, bias, w1, b1, w2, b2,
+              item_ct1, shared_data_acc_ct1.get_pointer(),
+              shared_sums_acc_ct1);
+        });
+  });
+}
+
+// Softmax along the C dimension, which is assumed to be 64.
+// Each work-item processes two adjacent elements, so a group of 32
+// work-items covers one 64-element row; maximum and sum are combined across
+// that group with the warpMax / warpReduce helpers.
+//
+//   output - destination, same layout as `input`.
+//   input  - logits.
+//   input2 - optional second tensor added element-wise to `input` before the
+//            softmax (may be nullptr).
+//   N      - number of work-items that have work; the launcher passes
+//            rows * 32. Work-items with index >= N exit immediately.
+// NOTE(review): the early exit happens before the sub-group operations, which
+// is only safe if whole sub-groups exit together - presumably guaranteed by N
+// being a multiple of the sub-group size; confirm.
+template <typename T>
+void softmax_opt_64_kernel(T* output, const T* input,
+                                      const T* input2, int N,
+                                      const sycl::nd_item<3> &item_ct1) {
+  int index = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+              item_ct1.get_local_id(2);
+  if (index >= N) return;
+
+  // x[0..1]: the two elements of `input`; x[2..3]: the matching elements of
+  // `input2` (only filled in when input2 is non-null).
+  float x[4];
+  float ex[2];
+
+  // Load from memory. copyAs presumably moves one value of the template type
+  // per call (two halves as an int, two floats as a uint2) - TODO confirm
+  // against its definition.
+  const bool fp16 = std::is_same<sycl::half, T>::value;
+  if (fp16) {
+    sycl::half inp[2];
+    copyAs<int>(&inp[0], &input[index * 2]);
+    x[0] = (float)inp[0];
+    x[1] = (float)inp[1];
+    if (input2 != nullptr) {
+      copyAs<int>(&inp[0], &input2[index * 2]);
+      x[2] = (float)inp[0];
+      x[3] = (float)inp[1];
+    }
+  } else {
+    copyAs<sycl::uint2>(&x[0], &input[index * 2]);
+    if (input2 != nullptr) {
+      copyAs<sycl::uint2>(&x[2], &input2[index * 2]);
+    }
+  }
+
+  if (input2 != nullptr) {
+    x[0] += x[2];
+    x[1] += x[3];
+  }
+  // Row maximum, subtracted before exponentiation.
+  float threadMax = sycl::max(x[0], x[1]);
+  float maxval = warpMax(threadMax, item_ct1);
+  // Take the value held by lane 0 of the sub-group so that every lane uses
+  // the same maximum. The SYCL sub-group operation has no mask argument, so
+  // all lanes of the sub-group participate.
+  maxval = sycl::select_from_group(item_ct1.get_sub_group(), maxval, 0);
+
+  ex[0] = sycl::exp(x[0] - maxval);
+  ex[1] = sycl::exp(x[1] - maxval);
+
+  float threadSum = ex[0] + ex[1];
+  float Sum = warpReduce(threadSum, item_ct1);
+  // As above: use lane 0's sum in every lane.
+  Sum = sycl::select_from_group(item_ct1.get_sub_group(), Sum, 0);
+
+  ex[0] = ex[0] / Sum;
+  ex[1] = ex[1] / Sum;
+
+  // Store to memory, again two elements per work-item.
+  if (fp16) {
+    sycl::half op[2];
+    op[0] = (sycl::half)ex[0];
+    op[1] = (sycl::half)ex[1];
+    copyAs<int>(&output[index * 2], &op[0]);
+  } else {
+    copyAs<sycl::uint2>(&output[index * 2], &ex[0]);
+  }
+}
+
+// Softmax over the C dimension of an N * C tensor.
+// Each work-item processes one element.
+// C work-items per work-group, N work-groups (C is taken from the local
+// range, the row from the group id).
+//
+//   output   - destination, N * C elements.
+//   input    - logits, N * C elements.
+//   input2   - optional tensor added element-wise to `input` first (may be
+//              nullptr).
+//   localsum - work-group shared accumulator for the sum of exponentials.
+//   localmax - work-group shared accumulator for the row maximum.
+// Both accumulators are updated through relaxed, work-group scoped
+// atomic_refs; the barriers between the phases order the updates.
+template <typename T>
+void softmax_kernel(T* output, const T* input, const T* input2,
+                    const sycl::nd_item<3> &item_ct1, float &localsum,
+                    float &localmax) {
+  int n = item_ct1.get_group(2);
+  int c = item_ct1.get_local_id(2);
+  int C = item_ct1.get_local_range(2);
+  int index = n * C + c;
+  sycl::atomic_ref<float, sycl::memory_order::relaxed,
+                   sycl::memory_scope::work_group> maxval(localmax);
+  sycl::atomic_ref<float, sycl::memory_order::relaxed,
+                   sycl::memory_scope::work_group> sum(localsum);
+
+  // softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
+
+  float x = (float)input[index];
+  if (input2 != nullptr) x += (float)input2[index];
+
+  // Work-item 0 initialises the shared accumulators: the sum to zero and the
+  // maximum to its own value.
+  if (c == 0) {
+    sum = 0;
+    maxval = x;
+  }
+
+  // Make the initial values visible before anyone updates them.
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // Get max across warp first, and then update across C dimension.
+  // Only every 32nd work-item publishes its warp's maximum.
+  float warpmax = warpMax(x, item_ct1);
+  if ((c & 0x1F) == 0) maxval.fetch_max(warpmax);
+
+  // All maxima must be merged before the maximum is read below.
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  float ex = sycl::exp(x - maxval);
+
+  // compute warp wide sums first
+  float val = warpReduce(ex, item_ct1);
+
+  // update shared memory sum across C dimension
+  if ((c & 0x1F) == 0)
+      sum.fetch_add(val);
+
+  // All partial sums must be added before the total is read below.
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  float op = ex / sum;
+
+  output[index] = (T)op;
+}
+
+// Host-side launcher for softmax over the C dimension of an N * C tensor.
+//
+// C == 64 uses softmax_opt_64_kernel (two elements per work-item, 32
+// work-items per row, work-groups of 256). Any other C uses softmax_kernel
+// with N work-groups of C work-items and two work-group local scalars for the
+// running sum and maximum. `input2` is optional and may be nullptr. The call
+// only enqueues work on `sycl_queue`.
+template <typename T>
+void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::queue &sycl_queue) {
+  if (C != 64) {
+    // C is used directly as the work-group size, so it has to stay within
+    // the device's max_work_group_size.
+    const sycl::range<3> localRange(1, 1, C);
+    const sycl::range<3> groupRange(1, 1, N);
+
+    sycl_queue.submit([&](sycl::handler& cgh) {
+      sycl::local_accessor<float, 0> sumSlot(cgh);
+      sycl::local_accessor<float, 0> maxSlot(cgh);
+
+      cgh.parallel_for(
+          sycl::nd_range<3>(groupRange * localRange, localRange),
+          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
+            softmax_kernel<T>(output, input, input2, item_ct1, sumSlot,
+                              maxSlot);
+          });
+    });
+    return;
+  }
+
+  const int kBlockSize = 256;
+  const int size = N * 32;  // Total no of threads needed
+  const int numBlocks = DivUp(size, kBlockSize);
+
+  const sycl::range<3> localRange(1, 1, kBlockSize);
+  const sycl::range<3> groupRange(1, 1, numBlocks);
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(groupRange * localRange, localRange),
+      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
+        softmax_opt_64_kernel<T>(output, input, input2, size, item_ct1);
+      });
+}
+
+// Sums `x` over all work-items that share the same local id in dimension 0
+// and returns the total to each of them.
+//
+// The sum is built in two steps: warpReduce combines the values of work-items
+// along dimension 2, the work-item with local id 0 in that dimension stores
+// the partial sum in sum[id0][id1], and after a barrier the work-item with
+// id1 == 0 adds up the partial sums of its row into sum[id0][0]. `sum` is
+// work-group local scratch and must provide at least
+// local_range(0) x local_range(1) entries; its previous contents are
+// overwritten. Every work-item of the work-group has to call this function
+// because it contains work-group barriers.
+[[gnu::always_inline]]
+inline float shared_sum_for_layer_norm(
+    float x, const sycl::nd_item<3>& item_ct1,
+    sycl::local_accessor<float, 2> sum) {
+  const auto row = item_ct1.get_local_id(0);
+  const auto warp = item_ct1.get_local_id(1);
+  const bool firstLane = item_ct1.get_local_id(2) == 0;
+
+  // Step 1: warp-wide partial sum, published by the first lane.
+  const float warpSum = warpReduce(x, item_ct1);
+  if (firstLane) sum[row][warp] = warpSum;
+
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // Step 2: one work-item per row folds the partial sums together.
+  if (firstLane && warp == 0) {
+    const int warps = item_ct1.get_local_range(1);
+    float total = 0;
+    for (int j = 0; j < warps; j++) total += sum[row][j];
+    sum[row][0] = total;
+  }
+
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // sum[row][0] now contains the sum across the C dimension.
+  return sum[row][0];
+}
+
+// Fused bias add, activation, optional skip add and layer normalization.
+//
+// Each work-item processes 16 consecutive channels of one row:
+// 1. val = activate(input + bias) * alpha (+ skip, when skip is non-null)
+// 2. Layer norm across the C dimension:
+//    output = (val - mean) / sqrt(var + ep) * gammas + betas
+//
+//   N, C     - number of rows and channels per row.
+//   bias, gammas, betas - per-channel vectors (indexed by channel only).
+//   skip     - optional tensor with the same indexing as `input`.
+//   sum      - work-group local scratch used by shared_sum_for_layer_norm.
+// Work-items whose first channel is >= C ("oob" work-items) load and store
+// nothing but still take part in the two reductions with a contribution of 0.
+template <typename T>
+void layer_norm_kernel(int N, int C, T* output, const T* input, const T* bias,
+                       const T* skip, const T* gammas, const T* betas, float ep,
+                       float alpha, ActivationFunction act,
+                       const sycl::nd_item<3>& item_ct1,
+                       sycl::local_accessor<float, 2> sum) {
+  // Row handled by this work-item.
+  int n = item_ct1.get_group(2) * item_ct1.get_local_range(0) +
+          item_ct1.get_local_id(0);
+  if (n >= N) return;
+  // First of the 16 channels handled by this work-item.
+  int c = (item_ct1.get_local_id(1) * 32 + item_ct1.get_local_id(2)) * 16;
+  bool oobThread = c >= C;
+
+  int biasIndex = c;
+  int tensorIndex = n * C + c;
+
+  // val: running values of the 16 channels; oth: scratch for the second
+  // operand of each step (bias, skip, gammas, betas in turn).
+  float val[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  float oth[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  const bool fp16 = std::is_same<sycl::half, T>::value;
+  if (!oobThread) {
+    // Load input and bias from memory (16 elements per work-item, moved in
+    // uint4-sized pieces) and add them.
+    if (fp16) {
+      sycl::half inp[8];
+      copyAs<sycl::uint4>(&inp[0], &input[tensorIndex]);
+      for (int i = 0; i < 8; i++) val[i] = (float)inp[i];
+      copyAs<sycl::uint4>(&inp[0], &input[tensorIndex + 8]);
+      for (int i = 0; i < 8; i++) val[i + 8] = (float)inp[i];
+      copyAs<sycl::uint4>(&inp[0], &bias[biasIndex]);
+      for (int i = 0; i < 8; i++) oth[i] = (float)inp[i];
+      copyAs<sycl::uint4>(&inp[0], &bias[biasIndex + 8]);
+      for (int i = 0; i < 8; i++) oth[i + 8] = (float)inp[i];
+      for (int i = 0; i < 16; i++) val[i] += oth[i];
+    } else {
+      copyAs<sycl::uint4>(&val[0], &input[tensorIndex]);
+      copyAs<sycl::uint4>(&val[4], &input[tensorIndex + 4]);
+      copyAs<sycl::uint4>(&val[8], &input[tensorIndex + 8]);
+      copyAs<sycl::uint4>(&val[12], &input[tensorIndex + 12]);
+      copyAs<sycl::uint4>(&oth[0], &bias[biasIndex]);
+      copyAs<sycl::uint4>(&oth[4], &bias[biasIndex + 4]);
+      copyAs<sycl::uint4>(&oth[8], &bias[biasIndex + 8]);
+      copyAs<sycl::uint4>(&oth[12], &bias[biasIndex + 12]);
+      for (int i = 0; i < 16; i++) val[i] += oth[i];
+    }
+  }
+
+  if (!oobThread) {
+    if (skip != nullptr) {
+      // Load the skip connection from memory (16 elements per work-item).
+      if (fp16) {
+        sycl::half inp[8];
+        copyAs<sycl::uint4>(&inp[0], &skip[tensorIndex]);
+        for (int i = 0; i < 8; i++) oth[i] = (float)inp[i];
+        copyAs<sycl::uint4>(&inp[0], &skip[tensorIndex + 8]);
+        for (int i = 0; i < 8; i++) oth[i + 8] = (float)inp[i];
+      } else {
+        copyAs<sycl::uint4>(&oth[0], &skip[tensorIndex]);
+        copyAs<sycl::uint4>(&oth[4], &skip[tensorIndex + 4]);
+        copyAs<sycl::uint4>(&oth[8], &skip[tensorIndex + 8]);
+        copyAs<sycl::uint4>(&oth[12], &skip[tensorIndex + 12]);
+      }
+    }
+  }
+
+  // 1. Compute mean. The activation, alpha scaling and skip add are applied
+  // here as well. Note that the `else` below pairs with the inner
+  // `if (skip != nullptr)`, so oob work-items skip both loops.
+  float s = 0;
+  if (!oobThread)
+    if (skip != nullptr) {
+      for (int i = 0; i < 16; i++) {
+        val[i] = activate(val[i], act) * alpha + oth[i];
+        s += val[i];
+      }
+    } else {
+      for (int i = 0; i < 16; i++) {
+        val[i] = activate(val[i], act) * alpha;
+        s += val[i];
+      }
+    }
+
+  s = shared_sum_for_layer_norm(s, item_ct1, sum);
+  float mean = s / C;
+
+  // 2. Compute variance
+  s = 0;
+  if (!oobThread)
+    for (int i = 0; i < 16; i++) {
+      float d = val[i] - mean;
+      float d_sq = d * d;
+      s += d_sq;
+    }
+  s = shared_sum_for_layer_norm(s, item_ct1, sum);
+  float var = s / C;
+
+  if (!oobThread) {
+    // Load gammas from memory (16 elements per work-item).
+    if (fp16) {
+      sycl::half inp[8];
+      copyAs<sycl::uint4>(&inp[0], &gammas[biasIndex]);
+      for (int i = 0; i < 8; i++) oth[i] = (float)inp[i];
+      copyAs<sycl::uint4>(&inp[0], &gammas[biasIndex + 8]);
+      for (int i = 0; i < 8; i++) oth[i + 8] = (float)inp[i];
+    } else {
+      copyAs<sycl::uint4>(&oth[0], &gammas[biasIndex]);
+      copyAs<sycl::uint4>(&oth[4], &gammas[biasIndex + 4]);
+      copyAs<sycl::uint4>(&oth[8], &gammas[biasIndex + 8]);
+      copyAs<sycl::uint4>(&oth[12], &gammas[biasIndex + 12]);
+    }
+  }
+
+  // 3. Normalize and scale by gamma. This also runs for oob work-items, whose
+  // results are never stored.
+  for (int i = 0; i < 16; i++) {
+    float d = val[i] - mean;
+    float norm = d / sycl::sqrt(var + ep);
+    float op = norm * oth[i];
+    val[i] = op;
+  }
+
+  if (!oobThread) {
+    // Load betas from memory (16 elements per work-item).
+    if (fp16) {
+      sycl::half inp[8];
+      copyAs<sycl::uint4>(&inp[0], &betas[biasIndex]);
+      for (int i = 0; i < 8; i++) oth[i] = (float)inp[i];
+      copyAs<sycl::uint4>(&inp[0], &betas[biasIndex + 8]);
+      for (int i = 0; i < 8; i++) oth[i + 8] = (float)inp[i];
+    } else {
+      copyAs<sycl::uint4>(&oth[0], &betas[biasIndex]);
+      copyAs<sycl::uint4>(&oth[4], &betas[biasIndex + 4]);
+      copyAs<sycl::uint4>(&oth[8], &betas[biasIndex + 8]);
+      copyAs<sycl::uint4>(&oth[12], &betas[biasIndex + 12]);
+    }
+  }
+
+  // 4. Add beta.
+  for (int i = 0; i < 16; i++) {
+    val[i] += oth[i];
+  }
+
+  if (!oobThread) {
+    // Write to memory
+    if (fp16) {
+      sycl::half op[8];
+      for (int i = 0; i < 8; i++) op[i] = (sycl::half)val[i];
+      copyAs<sycl::uint4>(&output[tensorIndex], &op[0]);
+      for (int i = 0; i < 8; i++) op[i] = (sycl::half)val[i + 8];
+      copyAs<sycl::uint4>(&output[tensorIndex + 8], &op[0]);
+    } else {
+      copyAs<sycl::uint4>(&output[tensorIndex], &val[0]);
+      copyAs<sycl::uint4>(&output[tensorIndex + 4], &val[4]);
+      copyAs<sycl::uint4>(&output[tensorIndex + 8], &val[8]);
+      copyAs<sycl::uint4>(&output[tensorIndex + 12], &val[12]);
+    }
+  }
+}
+
+// Host-side launcher for layer_norm_kernel: adds the bias and the (optional)
+// skip connection to the input and then performs layer normalization across
+// the C dimension (mean and variance are taken over the elements of a row).
+//
+// One work-group is launched per row (N groups). Inside a group every
+// work-item handles 16 consecutive channels, arranged as DivUp(C / 16, 32)
+// rows of 32 work-items. Throws Exception("unsupported filter size") when C
+// is not a multiple of 16 or exceeds 8192. The call only enqueues work on
+// `sycl_queue`.
+template <typename T>
+void LayerNorm(int N, int C, T* output, const T* input, const T* bias,
+               const T* skip, const T* gammas, const T* betas, float ep,
+               float alpha, ActivationFunction act, sycl::queue &sycl_queue) {
+  // The kernel moves 16 channels per work-item and the 16 x 16 scratch array
+  // below bounds the number of partial sums, hence the limits on C.
+  const bool channelsSupported = (C % 16 == 0) && (C <= 8192);
+  if (!channelsSupported) throw Exception("unsupported filter size");
+
+  // The work-group size depends on C, so it also has to stay within the
+  // device's max_work_group_size.
+  const sycl::range<3> blockDim(1, DivUp(C / 16, 32), 32);
+  const sycl::range<3> gridDim(1, 1, N);
+
+  sycl_queue.submit([&](sycl::handler& cgh) {
+    // Work-group local scratch for the partial sums of the two reductions.
+    sycl::local_accessor<float, 2> sum_acc_ct1(sycl::range<2>(16, 16), cgh);
+
+    cgh.parallel_for(
+        sycl::nd_range<3>(gridDim * blockDim, blockDim),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
+          layer_norm_kernel<T>(N, C, output, input, bias, skip, gammas, betas,
+                               ep, alpha, act, item_ct1, sum_acc_ct1);
+        });
+  });
+}
+
+// Compute promotion logits in a single kernel
+// keys matrix is of N * 64 * C (but we use only last 8 from the 'rows'
+// dimension, so N * 8 * C)
+// ppo matrix is 4 * C (weights for dense layer / matrix multiplication)
+// policy_attn_logits matrix is N * 64 * 64, but we use only 8x8 part of it
+// from each batch dimension (so, N * 8 * 8)
+// output matrix (promotion logits) is of N * 8 * 24 size
+template <typename T>
+void promotion_logits_kernel(int C, T* output, const T* keys,
+                                        const T* ppo,
+                                        const T* policy_attn_logits,
+                                        const sycl::nd_item<3> &item_ct1,
+                                        sycl::local_accessor<float, 2> promotion_offsets) {
+  constexpr int output_stride = 64 * 64 + 8 * 24;  // per-n element stride of policy_attn_logits and output
+  int n = item_ct1.get_group(2);     // [0..N)
+  int y = item_ct1.get_local_id(1);  // [0..8)
+  int x = item_ct1.get_local_id(2);  // [0..24)     // Can split into 8 * 3
+
+  int threadInGroup = item_ct1.get_local_id(1) * 24 + item_ct1.get_local_id(2);
+
+  // phase 1 : compute promotion_offsets by multiplying keys and ppo matrices
+  const T* keys_start =
+      keys + n * 64 * C + C * 56;  // we are interested only in last 8 out of 64
+                                   // 'rows' of keys matrix
+
+  // only 32 threads out of 192 in the group are active in this phase, and each
+  // thread computes one element of the promotion_offsets matrix
+  // TODO: opt idea1, can use more threads to reduce the length of the loop for
+  // the matrix multiply (do parallel reduction of partial sums later)
+  //       opt idea2, the below loop for matrix mul has very poor memory access
+  //       pattern, can do the loop over 32, and do parallel reductions
+  if (threadInGroup < 32) {
+    int x = threadInGroup % 4;  // ppo row, [0..4) (shadows the outer x)
+    int y = threadInGroup / 4;  // keys row, [0..8) (shadows the outer y)
+
+    float S = 0;
+    for (int i = 0; i < C;
+         i++) {  // TODO: modify to loop over 32 instead of C (doing parallel
+                 // reductions for the 32 sums)
+      float a = (float)keys_start[y * C + i];
+      float b =
+          (float)ppo[x * C + i];  // weight matrix is transposed (col major)
+      S += a * b;
+    }
+
+    // write the product (promotion_offsets) in shared memory
+    promotion_offsets[x][y] = S;
+  }
+
+  /*
+  Wait until all phase-1 writes to promotion_offsets are done before any
+  work-item reads them in phase 2. The data exchanged between the phases
+  goes through the local accessor, so the barrier uses a local-space fence.
+  */
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // phase 2: add the last "row" to the other 3
+  // #knight offset is added to the other three
+  // promotion_offsets = promotion_offsets[:, :3, :] + promotion_offsets[:, 3:4,
+  // :]
+  // Only 24 of the 32 threads passing the guard (those with x < 3) do work here
+  if (threadInGroup < 32) {
+    int x = threadInGroup % 4;
+    int y = threadInGroup / 4;
+    if (x < 3) {
+      promotion_offsets[x][y] += promotion_offsets[3][y];
+    }
+  }
+
+  // the phase-2 sums must be complete before phase 3 reads promotion_offsets
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // phase 3: add 8x8 chunk of policy_attn_logits matrix to promotion offsets
+  //          the output is 3x8x8 (written as 8 * 24)
+  // All threads are active in this phase and they compute one element each
+  int w = x / 3;
+  int c = x % 3;
+
+  // n_promo_logits = matmul_qk[:, -16:-8, -8:]  # default traversals from rank
+  // 7 to rank 8
+  float n_promo_logit =
+      (float)policy_attn_logits[n * output_stride + (48 + y) * 64 + (56 + w)];
+  float promo_offset = promotion_offsets[c][w];
+
+  float op = n_promo_logit + promo_offset;
+
+  output[n * output_stride + threadInGroup] = (T)op;
+}
+
+// Host launcher for promotion_logits_kernel: N work-groups of 8 x 24 work-items,
+// each work-item producing a single output element.
+template <typename T>
+void ComputePromotionLogits(int N, int C, T* output, const T* keys,
+                            const T* ppo, const T* policy_attn_logits, sycl::queue &sycl_queue) {
+  const sycl::range<3> work_group(1, 8, 24);
+  const sycl::range<3> num_groups(1, 1, N);
+  const sycl::nd_range<3> launch(num_groups * work_group, work_group);
+
+  sycl_queue.submit([&](sycl::handler& cgh) {
+    // 4 x 8 floats of work-group local memory for the promotion offsets.
+    sycl::local_accessor<float, 2> promo_offsets_local(
+        sycl::range<2>(4, 8), cgh);
+    cgh.parallel_for(launch, [=](sycl::nd_item<3> item_ct1) {
+      promotion_logits_kernel<T>(C, output, keys, ppo, policy_attn_logits,
+                                 item_ct1, promo_offsets_local);
+    });
+  });
+}
+
+// Builds one element of the NHWC attention-body input. For batch entry n and
+// position hw, channels [0, input_size) are copied from the NCHW `input` and
+// the following encoding_size channels from the position `encoding` array.
+template <typename T>
+void preprocess_for_attention_body_kernel(
+    T* output, const T* input, const T* encoding, int input_size,
+    int encoding_size, bool is_pe_dense_embedding,
+    const sycl::nd_item<3> &item_ct1) {
+  const int batch = item_ct1.get_group(2);
+  const int pos = item_ct1.get_group(1);
+  const int channel = item_ct1.get_local_id(2) +
+                      item_ct1.get_local_range(2) * item_ct1.get_group(0);
+  const int total_channels = input_size + encoding_size;
+  if (channel >= total_channels) return;  // work-item past the last channel
+
+  T value;
+  if (channel < input_size) {
+    value = input[batch * input_size * 64 + channel * 64 + pos];  // nchw
+  } else if (is_pe_dense_embedding) {
+    // position encoding stored separately for every batch entry
+    value = (T)(encoding[batch * 64 * encoding_size + pos * encoding_size +
+                         (channel - input_size)]);
+  } else {
+    // position encoding shared by all batch entries, row stride fixed at 64
+    value = (T)(encoding[64 * pos + (channel - input_size)]);
+  }
+  output[batch * 64 * total_channels + pos * total_channels + channel] = value;
+}
+
+// Launches preprocess_for_attention_body_kernel on sycl_queue, one work-item per
+// output element. The grid is (channel strips, 64, N); every work-group covers
+// up to 512 consecutive channels of one (n, hw) position.
+template <typename T>
+void inputPreprocessForAttentionBody(T* output, const T* input,
+                                     const T* encoding, int N, int input_size,
+                                     int encoding_size,
+                                     bool is_pe_dense_embedding,
+                                     sycl::queue &sycl_queue) {
+  const int total_channels = input_size + encoding_size;
+
+  // Work-group: a 1-D strip of at most 512 channels.
+  const sycl::range<3> blockSize(1, 1, sycl::min(total_channels, 512));
+  // Grid: enough strips along dimension 0 to cover all channels.
+  const sycl::range<3> gridSize(DivUp(total_channels, blockSize[2]), 64, N);
+
+  sycl_queue.parallel_for(
+      sycl::nd_range<3>(gridSize * blockSize, blockSize),
+      [=](sycl::nd_item<3> item_ct1) {
+        preprocess_for_attention_body_kernel<T>(
+            output, input, encoding, input_size, encoding_size,
+            is_pe_dense_embedding, item_ct1);
+      });
+}
+
+// Input gating fused with a weights transpose. Per element of N x HW x C:
+//   output[n][hw][c] = input[n][hw][c] * mult[c][hw] + add[c][hw]
+template <typename T>
+void input_gating_kernel(T* output, const T* input, const T* mult,
+                                    const T* add, int HW, int C,
+                                    const sycl::nd_item<3> &item_ct1) {
+  const int n_offset = item_ct1.get_group(0) * HW * C;
+  const int hw = item_ct1.get_local_id(1);
+  const int c = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                item_ct1.get_local_id(2);
+  // Mask on the channel rather than on the flattened index: with the grid
+  // rounded up, a work-item with c >= C passed the old `idx < HW * C` test,
+  // aliased element (hw + 1, c - C) and read mult/add beyond C * HW.
+  if (hw >= HW || c >= C) return;
+
+  const int idx = hw * C + c;    // index in input and output
+  const int idxT = c * HW + hw;  // index in transposed weights mult and add
+  const float op =
+      (float)input[n_offset + idx] * (float)mult[idxT] + (float)add[idxT];
+  output[n_offset + idx] = (T)op;
+}
+
+// Launches input_gating_kernel on sycl_queue; each work-item computes a single
+// output element.
+//   work-group: HW (dimension 1) x DivUp(512, HW) channels (dimension 2)
+//   grid:       N (dimension 0) x DivUp(C, channels per work-group) (dimension 2)
+// NOTE(review): the grid is rounded up, so unless C is a multiple of the
+// per-group channel count some work-items fall outside [0, C) and have to be
+// masked by the kernel.
+template <typename T>
+void applyInputGating(T* output, const T* input, const T* mult, const T* add,
+                      int N, int HW, int C, sycl::queue &sycl_queue) {
+  // dimension order is (0, 1, 2)
+  const sycl::range<3> blockSize(1, HW, DivUp(512, HW));
+  const sycl::range<3> gridSize(N, 1, DivUp(C, blockSize[2]));
+  const sycl::nd_range<3> launch(gridSize * blockSize, blockSize);
+
+  sycl_queue.parallel_for(
+      launch,
+      [=](sycl::nd_item<3> item_ct1) {
+        input_gating_kernel<T>(output, input, mult, add, HW, C, item_ct1);
+      });
+}
+// Writes five block_size-long pointer tables (into k, q, b1, v, b2) to offsets[].
+template<typename T, int kWorkPerThread>
+static void genOffsetPointers_kernel(T** offsets, int heads, int block_size,
+                                     int depth, int d_model, T* k, T* q, T* b1,
+                                     T* v, T* b2,
+                                     const sycl::nd_item<1>& item_ct) {
+  const int i = item_ct.get_global_id(0) * kWorkPerThread;  // first element handled here
+  if (i >= block_size) return;
+  const int h = i % heads;  // element i + w is addressed below as head h + w of
+  const int n = i / heads;  // the same n: presumes heads % kWorkPerThread == 0
+  int w;
+  T* res[kWorkPerThread];
+  for (w = 0; w < kWorkPerThread; w++) {  // first table: pointers into k
+    res[w] = k + h * depth + 64 * d_model * n + w * depth;
+    offsets[i + w] = res[w];
+  }
+  // second table: pointers into q
+  for (w = 0; w < kWorkPerThread; w++) {
+    res[w] = q + h * depth + 64 * d_model * n + w * depth;
+    offsets[i + w + block_size] = res[w];
+  }
+  // third table: one 64 * 64 block of b1 per element
+  for (w = 0; w < kWorkPerThread; w++) {
+    res[w] = b1 + i * 64 * 64 + w * 64 * 64;
+    offsets[i + w + 2 * block_size] = res[w];
+  }
+  // fourth table: pointers into v
+  for (w = 0; w < kWorkPerThread; w++) {
+    res[w] = v + h * depth + 64 * d_model * n + w * depth;
+    offsets[i + w + 3 * block_size] = res[w];
+  }
+  // fifth table: pointers into b2
+  for (w = 0; w < kWorkPerThread; w++) {
+    res[w] =  b2 + h * depth + 64 * d_model * n + w * depth;
+    offsets[i + w + 4 * block_size] = res[w];
+  }
+}
+
+// Fills offsets[] on sycl_queue with five block_size-long pointer tables (for
+// k, q, b1, v and b2), where block_size = heads * max_batch.
+template <typename T>
+void genOffsetPointers(T** offsets, int heads, int max_batch, int depth,
+                       int d_model, T* k, T* q, T* b1,
+                       T* v, T* b2, sycl::queue& sycl_queue) {
+  const int block_size = heads * max_batch;
+  // Process two elements per thread to use 128 bit store instructions.
+  constexpr int kWorkPerThread = 2;
+  constexpr int kWorkGroupSize = 128;
+  const sycl::range<1> local(kWorkGroupSize);
+  // The paired kernel addresses element i + 1 as head h + 1 of the same n, so
+  // it needs an even heads (which also makes block_size even). Testing only
+  // block_size sent odd heads with an even max_batch down the paired path.
+  if (heads % kWorkPerThread != 0) {  // one element per work-item
+    const sycl::range<1> global(DivUp(block_size, kWorkGroupSize));
+    sycl_queue.parallel_for(sycl::nd_range<1>(global * local, local),
+        [=](sycl::nd_item<1> item_ct) {
+        genOffsetPointers_kernel<T, 1>(offsets, heads, block_size, depth,
+                                       d_model, k, q, b1, v, b2, item_ct);
+        });
+  } else {  // two elements per work-item
+    const sycl::range<1> global(DivUp(block_size, kWorkGroupSize*kWorkPerThread));
+    sycl_queue.parallel_for(sycl::nd_range<1>(global * local, local),
+        [=](sycl::nd_item<1> item_ct) {
+        genOffsetPointers_kernel<T, kWorkPerThread>(
+            offsets, heads, block_size, depth, d_model, k, q, b1, v, b2, item_ct);
+        });
+  }
+}
+
+// Template instantiation.
+template void copyTypeConverted<sycl::half, float>(sycl::half* op, float* ip, int N, sycl::queue &sycl_queue);
+template void copyTypeConverted<float, sycl::half>(float* op, sycl::half* ip, int N, sycl::queue &sycl_queue);
+template void copyTypeConverted<float, float>(float* op, float* ip, int N, sycl::queue &sycl_queue);
+template void copyTypeConverted<sycl::half, sycl::half>(sycl::half* op, sycl::half* ip, int N, sycl::queue &sycl_queue);
+
+template void batchNorm<float>(float* output, const float* input,
+                               const float* skipInput, int N, int C, int H,
+                               int W, float* means, float* var_multipliers,
+                               ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void batchNorm<sycl::half>(sycl::half* output, const sycl::half* input,
+                              const sycl::half* skipInput, int N, int C, int H, int W,
+                              float* means, float* var_multipliers,
+                              ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void addVectors<float>(float* c, float* a, float* b, int size,
+                                int asize, int bsize, ActivationFunction act, sycl::queue &sycl_queue);
+
+template void addVectors<sycl::half>(sycl::half* c, sycl::half* a, sycl::half* b, int size, int asize,
+                               int bsize, ActivationFunction act, sycl::queue &sycl_queue);
+
+template void addVectorsHNC_NHC<float>(float* a, float* b, int N, int H, int C, sycl::queue &sycl_queue);
+template void addVectorsHNC_NHC<sycl::half>(sycl::half* a, sycl::half* b, int N, int H, int C, sycl::queue &sycl_queue);
+
+template void addBiasBatched<float>(float* output, const float* input,
+                                    const float* bias, int Batch, int N, int C,
+                                    ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void addBiasBatched<sycl::half>(sycl::half* output, const sycl::half* input,
+                                   const sycl::half* bias, int Batch, int N, int C,
+                                   ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void addBiasBatched<float>(float* output, const float* input,
+                                    const float* bias, int Batch, int N, int C,
+                                    int Nstride, ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void addBiasBatched<sycl::half>(sycl::half* output, const sycl::half* input,
+                                   const sycl::half* bias, int Batch, int N, int C,
+                                   int Nstride, ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void addBias_NCHW<float>(float* c, float* a, float* b, int N, int C,
+                                  int H, int W, ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void addBias_NCHW<sycl::half>(sycl::half* c, sycl::half* a, sycl::half* b, int N, int C, int H,
+                                 int W, ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void globalAvgPool<float>(int N, int C, float* output,
+                                   const float* input,
+                                   const float* prevLayerBias, bool nhwc, sycl::queue &sycl_queue);
+
+template void globalAvgPool<sycl::half>(int N, int C, sycl::half* output, const sycl::half* input,
+                                  const sycl::half* prevLayerBias, bool nhwc, sycl::queue &sycl_queue);
+
+template void globalScale<float>(int N, int C, float* output,
+                                 const float* input, const float* scaleBias,
+                                 const float* prevLayerBias, bool nhwc,
+                                 ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void globalScale<sycl::half>(int N, int C, sycl::half* output, const sycl::half* input,
+                                const sycl::half* scaleBias,
+                                const sycl::half* prevLayerBias, bool nhwc,
+                                ActivationFunction activation, sycl::queue &sycl_queue);
+
+template void PolicyMap<float>(int N, float* output, const float* input,
+                               const short* indices, int inputSize,
+                               int usedSize, int outputSize, sycl::queue &sycl_queue);
+
+template void PolicyMap<sycl::half>(int N, sycl::half* output, const sycl::half* input,
+                              const short* indices, int inputSize, int usedSize,
+                              int outputSize, sycl::queue &sycl_queue);
+
+template void FilterTransform<float>(int N, int C, float* transformedFilter,
+                                     const float* filter, sycl::queue &sycl_queue);
+
+template void InputTransform<float, true>(int N, int C,
+                                          float* transformed_input,
+                                          const float* input, sycl::queue &sycl_queue);
+
+template void InputTransform<float, false>(int N, int C,
+                                           float* transformed_input,
+                                           const float* input, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, true, ACTIVATION_RELU, true, true, false,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void
+OutputTransform<float, false, ACTIVATION_RELU, true, true, false, false>(
+
+    int N, int C, int se_K, float* output, const float* input,
+    const float* skip, const float* bias, const float* w1, const float* b1,
+    const float* w2, const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, true, ACTIVATION_RELU, true, true, true,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_RELU, true, true, true,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_RELU, true, false, false,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_RELU, true, false, false,
+                              true>(int N, int C, int se_K, float* output,
+                                    const float* input, const float* skip,
+                                    const float* bias, const float* w1,
+                                    const float* b1, const float* w2,
+                                    const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, true, ACTIVATION_RELU, true, true, true,
+                              true>(int N, int C, int se_K, float* output,
+                                    const float* input, const float* skip,
+                                    const float* bias, const float* w1,
+                                    const float* b1, const float* w2,
+                                    const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, true, ACTIVATION_MISH, true, true, false,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_MISH, true, true, false,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, true, ACTIVATION_MISH, true, true, true,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_MISH, true, true, true,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_MISH, true, false, false,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_MISH, true, false, false,
+                              true>(int N, int C, int se_K, float* output,
+                                    const float* input, const float* skip,
+                                    const float* bias, const float* w1,
+                                    const float* b1, const float* w2,
+                                    const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, true, ACTIVATION_MISH, true, true, true,
+                              true>(int N, int C, int se_K, float* output,
+                                    const float* input, const float* skip,
+                                    const float* bias, const float* w1,
+                                    const float* b1, const float* w2,
+                                    const float* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<float, false, ACTIVATION_NONE, true, false, false,
+                              false>(int N, int C, int se_K, float* output,
+                                     const float* input, const float* skip,
+                                     const float* bias, const float* w1,
+                                     const float* b1, const float* w2,
+                                     const float* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<float, true, ACTIVATION_RELU, true, true>(
+    int N, int C, int se_K, float* output, const float* input,
+    const float* skip, const float* bias, const float* w1, const float* b1,
+    const float* w2, const float* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<float, false, ACTIVATION_RELU, true, true>(
+    int N, int C, int se_K, float* output, const float* input,
+    const float* skip, const float* bias, const float* w1, const float* b1,
+    const float* w2, const float* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<float, false, ACTIVATION_RELU, true, false>(
+    int N, int C, int se_K, float* output, const float* input,
+    const float* skip, const float* bias, const float* w1, const float* b1,
+    const float* w2, const float* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<float, true, ACTIVATION_MISH, true, true>(
+    int N, int C, int se_K, float* output, const float* input,
+    const float* skip, const float* bias, const float* w1, const float* b1,
+    const float* w2, const float* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<float, false, ACTIVATION_MISH, true, true>(
+    int N, int C, int se_K, float* output, const float* input,
+    const float* skip, const float* bias, const float* w1, const float* b1,
+    const float* w2, const float* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<float, false, ACTIVATION_MISH, true, false>(
+    int N, int C, int se_K, float* output, const float* input,
+    const float* skip, const float* bias, const float* w1, const float* b1,
+    const float* w2, const float* b2, sycl::queue &sycl_queue);
+
+template void Softmax<sycl::half>(int N, int C, sycl::half* output, const sycl::half* input,
+                            const sycl::half* input2, sycl::queue &sycl_queue);
+
+template void Softmax<float>(int N, int C, float* output, const float* input,
+                             const float* input2, sycl::queue &sycl_queue);
+
+template void LayerNorm<sycl::half>(int N, int C, sycl::half* output, const sycl::half* input,
+                              const sycl::half* bias, const sycl::half* skip,
+                              const sycl::half* gammas, const sycl::half* betas, float ep,
+                              float alpha, ActivationFunction act, sycl::queue &sycl_queue);
+
+template void LayerNorm<float>(int N, int C, float* output, const float* input,
+                               const float* bias, const float* skip,
+                               const float* gammas, const float* betas,
+                               float ep, float alpha, ActivationFunction act, sycl::queue &sycl_queue);
+
+template void ComputePromotionLogits<sycl::half>(int N, int C, sycl::half* output,
+                                           const sycl::half* keys, const sycl::half* ppo,
+                                           const sycl::half* policy_attn_logits, sycl::queue &sycl_queue);
+
+template void ComputePromotionLogits<float>(int N, int C, float* output,
+                                            const float* keys, const float* ppo,
+                                            const float* policy_attn_logits, sycl::queue &sycl_queue);
+
+template void convertNCHWtoNHWC<sycl::half, float>(sycl::half* output_tensor,
+                                             const float* input_tensor, int Nin,
+                                             int Cin, int Nout, int Cout, int H,
+                                             int W, sycl::queue &sycl_queue);
+
+template void convertNCHWtoNHWC<float, float>(float* output_tensor,
+                                              const float* input_tensor,
+                                              int Nin, int Cin, int Nout,
+                                              int Cout, int H, int W, sycl::queue &sycl_queue);
+
+template void convertNCHWtoNHWC<sycl::half, sycl::half>(sycl::half* output_tensor,
+                                            const sycl::half* input_tensor, int Nin,
+                                            int Cin, int Nout, int Cout, int H,
+                                            int W, sycl::queue &sycl_queue);
+
+template void inputPreprocessForAttentionBody<sycl::half>(
+    sycl::half* output, const sycl::half* input, const sycl::half* encoding, int N,
+    int input_size, int encoding_size, bool is_pe_dense_embedding,
+    sycl::queue &sycl_queue);
+
+template void inputPreprocessForAttentionBody<float>(
+    float* output, const float* input, const float* encoding, int N,
+    int input_size, int encoding_size, bool is_pe_dense_embedding,
+    sycl::queue &sycl_queue);
+// Parameter names follow the applyInputGating definition: (N, HW, C).
+template void applyInputGating<sycl::half>(sycl::half* output, const sycl::half* input,
+                                     const sycl::half* mult, const sycl::half* add, int N,
+                                     int HW, int C, sycl::queue &sycl_queue);
+
+template void applyInputGating<float>(float* output, const float* input,
+                                      const float* mult, const float* add,
+                                      int N, int HW, int C, sycl::queue &sycl_queue);
+
+template void genOffsetPointers<float>(float** offsets, int heads, int max_batch, int depth,
+                       int d_model, float* k, float* q, float* b1,
+                       float* v, float* b2, sycl::queue& sycl_queue);
+
+template void genOffsetPointers<sycl::half>(sycl::half** offsets, int heads, int max_batch, int depth,
+                       int d_model, sycl::half* k, sycl::half* q, sycl::half* b1,
+                       sycl::half* v, sycl::half* b2, sycl::queue& sycl_queue);
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/cuBlasContext.h b/src/neural/backends/sycl/cuBlasContext.h
new file mode 100644
index 0000000000..f330ce8150
--- /dev/null
+++ b/src/neural/backends/sycl/cuBlasContext.h
@@ -0,0 +1,98 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier:GNU General Public License v3.0 or later
+*/
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+#ifdef USE_CUBLAS
+
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+class cuBlasContextManager;
+static cuBlasContextManager *_cuBlasContextManager;
+
+class cuBlasContextManager{
+    // Lazily created owner of the cuBLAS handle (one per including translation unit).
+    cublasHandle_t handle;
+    cuBlasContextManager() {  // private: only getcuBlasHandle_t() creates one
+        cublasCreate(&handle);
+    }
+
+    public:
+     // Returns the handle, creating the manager on first use.
+     static cublasHandle_t getcuBlasHandle_t(){
+        if(_cuBlasContextManager == nullptr){
+            _cuBlasContextManager = new cuBlasContextManager();
+        }
+        return _cuBlasContextManager->handle;
+    }
+
+    // Destroys the handle, if any. Safe to call repeatedly; returns a null handle.
+    static cublasHandle_t destroycuBlasHandle_t(){
+        if(_cuBlasContextManager != nullptr){
+           cublasDestroy(_cuBlasContextManager->handle);
+           delete _cuBlasContextManager;     // allocated with new, so not free()
+           _cuBlasContextManager = nullptr;  // a later get...() creates a fresh one
+        }
+        return nullptr;  // the previous handle is no longer valid
+    }
+};
+
+#elif defined(USE_HIPBLAS)
+
+
+#include "hip/hip_runtime.h" 
+#include "hipblas/hipblas.h"
+
+class hipBlasContextManager;
+static hipBlasContextManager *_hipBlasContextManager;
+
+class hipBlasContextManager{
+    // Lazily created owner of the hipBLAS handle (one per including translation unit).
+    hipblasHandle_t handle;
+    hipBlasContextManager() {  // private: only gethipBlasHandle_t() creates one
+        hipblasCreate(&handle);
+    }
+
+    public:
+     // Returns the handle, creating the manager on first use.
+     static hipblasHandle_t gethipBlasHandle_t(){
+        if(_hipBlasContextManager == nullptr){
+            _hipBlasContextManager = new hipBlasContextManager();
+        }
+        return _hipBlasContextManager->handle;
+    }
+
+    // Destroys the handle, if any (name kept for callers); returns a null handle.
+    static hipblasHandle_t destroycuBlasHandle_t(){
+        if(_hipBlasContextManager != nullptr){
+           hipblasDestroy(_hipBlasContextManager->handle);
+           delete _hipBlasContextManager;     // allocated with new, so not free()
+           _hipBlasContextManager = nullptr;  // a later get...() creates a fresh one
+        }
+        return nullptr;  // the previous handle is no longer valid
+    }
+};
+
+
+
+#endif
diff --git a/src/neural/backends/sycl/fp16_kernels.dp.cpp b/src/neural/backends/sycl/fp16_kernels.dp.cpp
new file mode 100644
index 0000000000..a6921e9733
--- /dev/null
+++ b/src/neural/backends/sycl/fp16_kernels.dp.cpp
@@ -0,0 +1,944 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier: GPL-3.0-or-later
+*/
+
+#include <sycl/sycl.hpp>
+#include "sycl_common.h"
+#include "neural/backends/shared/activation.h"
+
+#include "winograd_helper.h"
+
+namespace lczero {
+namespace sycldnn_backend {
+
+/////////////////////////////////////////////////////////////////////////////
+//          fp16-specific kernels used by certain layers                   //
+/////////////////////////////////////////////////////////////////////////////
+
+// SE layer implementation using single fused kernel.
+
+// N blocks.
+// C threads per block.
+// 'HWC' input data processed by thread block.
+// Each thread processes 8x8 elements.
+// K is the no. of outputs of first fully connected layer (same as no. of inputs
+// for second fully connected layer).
+// The kernel assumes K <= C.
+
+template <int C, int K>
+/*
+Fused SE layer: one work-group per sample (group index n), one work-item per
+channel c, C work-items per group. Steps: (1) per-channel average of input
+(+ bPrev when non-null), (2) FC1 with w1/b1 + activation, (3) FC2 with w2/b2
+giving a sigmoid scale (b2[c]) and a bias (b2[c + C]), (4) output =
+activate(skip + input * scale + bias). sharedData must provide at least C
+halves of work-group local memory. The body is compiled out unless
+DPCT_COMPATIBILITY_TEMP >= 530. DPCT1110:20: the local variables declared here
+exceed 128 bytes and may cause high register pressure.
+*/
+void SE_Layer_NHWC(sycl::half* output, const sycl::half* skip,
+                   const sycl::half* input, const sycl::half* w1,
+                   const sycl::half* b1, const sycl::half* w2,
+                   const sycl::half* b2, const sycl::half* bPrev,
+                   ActivationFunction activation,
+                   const sycl::nd_item<3>& item_ct1, sycl::half* sharedData) {
+#if DPCT_COMPATIBILITY_TEMP >= 530
+  const int elementsPerThread = 64;  // 8x8 board
+  const int se_K = K;
+
+  int n = item_ct1.get_group(2);
+  int c = item_ct1.get_local_id(2);
+
+  sycl::half2 localData[elementsPerThread];  // x(): input + bias, y(): skip
+  sycl::half S = 0;
+
+  sycl::half bias = 0;
+  if (bPrev) bias = bPrev[c];
+
+// 1. Global avg (1 avg per thread).
+#pragma unroll
+  for (int i = 0; i < elementsPerThread; i++) {
+    int localIndex = i * C + c;
+    int inputIndex = n * C * elementsPerThread + localIndex;
+    localData[i].x() = input[inputIndex] + bias;
+    localData[i].y() = skip[inputIndex];
+    S += localData[i].x();
+  }
+
+  sycl::half avg = S / (sycl::half)elementsPerThread;
+  sharedData[c] = avg;
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // 2. First fully connected layer.
+  if (c < K) {
+    S = 0;
+#pragma unroll
+    for (int i = 0; i < C; i++) {
+      S += sharedData[i] * readw1(i, c);
+    }
+    S += b1[c];
+    S = activate(S, activation);
+  }
+  // sharedData is reused for the FC1 outputs: wait until every work-item has
+  // finished reading the averages before any of them is overwritten.
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  if (c < K) sharedData[c] = S;
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // 3. Second fully connected layer.
+  S = 0;
+  sycl::half B = 0;
+#pragma unroll
+  for (int i = 0; i < K; i++) {
+    sycl::half val = sharedData[i];
+    S += val * readw2(i, c);
+    B += val * readw2(i, c + C);
+  }
+  S += b2[c];
+  B += b2[c + C];
+
+  // Sigmoid (only on the scale part).
+  S = (sycl::half)(1.0f / (1.0f + sycl::exp(-(float)(S))));
+
+// 4. Scale, add bias and skip connection, apply activation, write to output.
+#pragma unroll
+  for (int i = 0; i < elementsPerThread; i++) {
+    int localIndex = i * C + c;
+    int inputIndex = n * C * elementsPerThread + localIndex;
+    sycl::half val = localData[i].y() + localData[i].x() * S + B;
+
+    // Apply the activation selected by 'activation'.
+    val = (sycl::half)activate((float)val, activation);
+    output[inputIndex] = val;
+  }
+#endif
+}
+
+bool Se_Fp16_NHWC(int N, int C, int numFc1Out, sycl::half* output,
+                  const sycl::half* skip, const sycl::half* input,
+                  const sycl::half* w1, const sycl::half* b1,
+                  const sycl::half* w2, const sycl::half* b2,
+                  const sycl::half* bPrev, ActivationFunction activation, sycl::queue &sycl_queue) {
+  // Picks SE_Layer_NHWC<C, K> for (C, numFc1Out). TODO: Think of more elegant way to avoid this hardcoding :-/
+  if (numFc1Out == 16) {
+    if (C == 64) {
+      /*
+      Launch shape used by every case below: N work-groups of C work-items and
+      C halves of local memory. DPCT1049:21: the work-group size may exceed
+      the device limit (see info::device::max_work_group_size).
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(64), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<64, 16>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                    activation, item_ct1,
+                                    sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else {
+      // TODO: support other channel counts. NOTE(review): throws here, while the other unsupported cases return false.
+      throw Exception("channel count unsupported by SE layer");
+    }
+  } else if (numFc1Out == 32) {
+    if (C == 64) {
+      /*
+      DPCT1049:22: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(64), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<64, 32>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                    activation, item_ct1,
+                                    sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 128) {
+      /*
+      DPCT1049:23: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(128), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<128, 32>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 192) {
+      /*
+      DPCT1049:24: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(192), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<192, 32>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 256) {
+      /*
+      DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(256), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<256, 32>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 320) {
+      /*
+      DPCT1049:26: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(320), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<320, 32>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 352) {
+      /*
+      DPCT1049:27: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(352), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<352, 32>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 384) {
+      /*
+      DPCT1049:28: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(384), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<384, 32>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else {
+      // TODO: support other channel counts.
+      return false;  // no instantiation for this C; nothing was submitted
+    }
+  } else if (numFc1Out == 64) {
+    if (C == 64) {
+      /*
+      DPCT1049:29: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(64), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<64, 64>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                    activation, item_ct1,
+                                    sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 128) {
+      /*
+      DPCT1049:30: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(128), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<128, 64>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 192) {
+      /*
+      DPCT1049:31: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(192), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<192, 64>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 256) {
+      /*
+      DPCT1049:32: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(256), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<256, 64>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 320) {
+      /*
+      DPCT1049:33: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(320), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<320, 64>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else if (C == 384) {
+      /*
+      DPCT1049:34: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      
+      sycl_queue.submit([&](sycl::handler& cgh) {
+        sycl::local_accessor<sycl::half, 1> sharedData_acc_ct1(
+            sycl::range<1>(384), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                              sycl::range<3>(1, 1, C)),
+            [=](sycl::nd_item<3> item_ct1) {
+              SE_Layer_NHWC<384, 64>(output, skip, input, w1, b1, w2, b2, bPrev,
+                                     activation, item_ct1,
+                                     sharedData_acc_ct1.get_pointer());
+            });
+      });
+    } else {
+      // TODO: support other channel counts.
+      return false;  // no instantiation for this C; nothing was submitted
+    }
+  } else {
+    // TODO: support other sizes.
+    return false;  // numFc1Out is not 16, 32 or 64; nothing was submitted
+  }
+  return true;  // a matching kernel was submitted to sycl_queue
+}
+
+// Get board for this thread from shared memory.
+// We are just using shared memory to store local thread data in this kernel to
+// help reduce some register pressure and spills to local memory.
+#define BOARD(y, x) shboard[(y)*8 + (x)]
+
+// input is in transformed space (HWNC layout) --- output of GEMM
+// output is also in transformed space (HWNC layout) --- input to GEMM (for
+// next layer)
+// 'C' threads per block
+// 'N' blocks
+// Every thread generates an entire board/plane (8x8 elements).
+template <ActivationFunction activation, bool use_bias, bool use_skip>
+/*
+Steps: output transform of four 4x4 tiles into the per-thread board, optional
+bias add + board average, SE (two FC layers, sigmoid on the scale), optional
+skip add, activation, then the input transform of four zero-padded 6x6
+tiles. DPCT1110:35: the local variables declared here exceed 128 bytes and
+may cause high register pressure.
+*/
+void OutputInputTransformKernel_fp16_shmem_board(
+    int N, int C, int se_K, sycl::half* output, const sycl::half* input,
+    sycl::half* skip, const sycl::half* bias, const sycl::half* w1,
+    const sycl::half* b1, const sycl::half* w2, const sycl::half* b2,
+    const sycl::nd_item<3>& item_ct1, uint8_t* dpct_local, float* shared_data,
+    sycl::local_accessor<float, 2> shared_sums) {
+#if DPCT_COMPATIBILITY_TEMP >= 530  // the body is compiled out otherwise
+  int k = item_ct1.get_local_id(2);
+  int n = item_ct1.get_group(2);
+
+  auto _sboard = (sycl::half*)dpct_local;
+  sycl::half* shboard = &_sboard[k * 72];  // 72 instead of 64 to reduce shared
+                                           // memory bank conflicts.
+  sycl::half b = bias[k];
+
+#pragma unroll
+  for (int hStart = 0; hStart < 8; hStart += 4)
+#pragma unroll
+    for (int wStart = 0; wStart < 8; wStart += 4) {
+      //  i) read to per thread registers (for doing output transform)
+      int shln = n * 4 + (hStart / 4) * 2 + (wStart / 4);
+      sycl::half outElTransformed[6][6];
+#pragma unroll
+      for (int y = 0; y < 6; y++)
+#pragma unroll
+        for (int x = 0; x < 6; x++)
+          outElTransformed[y][x] = input[TEMP_INDEX_HWNC(y, x, shln, k)];
+
+      // ii) transform it
+      sycl::half outEl[4][4];
+      OutputTransform4x4(&outEl[0][0], &outElTransformed[0][0]);
+
+#pragma unroll
+      for (int y = 0; y < 4; y++)
+        copyAs<sycl::uint2>(&BOARD(hStart + y, wStart), &outEl[y][0]);
+    }
+
+  // Add bias, and compute the average for SE.
+  float S = 0;
+  float B = 0;
+
+#pragma unroll
+  for (int y = 0; y < 8; y++) {
+    sycl::half boardRow[8];
+    copyAs<sycl::uint4>(&boardRow, &BOARD(y, 0));
+#pragma unroll
+    for (int x = 0; x < 8; x++) {
+      if (use_bias) boardRow[x] += b;
+      S += (float)boardRow[x];
+    }
+    if (use_bias) copyAs<sycl::uint4>(&BOARD(y, 0), &boardRow);
+  }
+
+  float avg = S / 64;
+  shared_data[k] = avg;
+
+  int lane = k & 0x1F;  // position within a group of 32 work-items
+  int warp = k >> 5;    // index of that group of 32
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // First fully-connected layer for SE
+
+  // As se_K << C, we want to loop over se_K instead of C
+  // even if it means taking the sum across threads
+
+    // per-warp sums
+
+  for (int i = 0; i < se_K; i++) {
+    float val = shared_data[k] * float(readw1(k, i));
+    val = warpReduce(val, item_ct1);  // NOTE(review): lane/warp above assume the reduction spans 32 work-items — confirm
+    if (lane == 0) shared_sums[warp][i] = val;
+  }
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+  if (k < se_K) {
+    S = 0;
+    for (int i = 0; i < C / 32; i++) S += shared_sums[i][k];
+
+    S += (float)b1[k];
+    S = activate(S, activation);
+    shared_data[k] = S;
+  }
+
+  item_ct1.barrier(sycl::access::fence_space::local_space);
+
+  // Second fully-connected layer for SE
+  S = 0;
+  for (int i = 0; i < se_K; i++) {
+    float val = shared_data[i];
+    S += val * float(readw2(i, k));
+    B += val * float(readw2(i, k + C));
+  }
+  S += (float)b2[k];
+  B += (float)b2[k + C];
+
+  // Sigmoid (only on the scale part).
+  S = 1.0f / (1.0f + sycl::exp(-S));
+
+  // Scale/bias, add skip connection, perform activation, and write to output.
+  for (int h = 0; h < 8; h++) {
+    sycl::half boardRow[8];
+    copyAs<sycl::uint4>(&boardRow[0], &BOARD(h, 0));
+
+#pragma unroll
+    for (int w = 0; w < 8; w++) {
+      boardRow[w] = (sycl::half)(float(boardRow[w]) * S + B);
+    }
+
+    // residual add
+    if (use_skip) {
+      sycl::half skipInp[8];
+      copyAs<sycl::uint4>(&skipInp[0], &skip[INDEX_NHCW(n, k, h, 0)]);
+#pragma unroll
+      for (int w = 0; w < 8; w++) boardRow[w] += skipInp[w];
+    }
+
+    if (activation != ACTIVATION_NONE) {
+#pragma unroll
+      for (int w = 0; w < 8; w++)
+        boardRow[w] = (sycl::half)activate((float)boardRow[w], activation);
+    }
+
+    // write un-transformed output to 'skip' if required
+    if (use_skip) {
+      copyAs<sycl::uint4>(&skip[INDEX_NHCW(n, k, h, 0)], &boardRow[0]);
+    }
+
+    copyAs<sycl::uint4>(&BOARD(h, 0), &boardRow);
+  }
+
+  // Perform input transform.
+
+  int c = k;
+  // top-left
+  {
+    sycl::half inEl[6][6] = {};  // zero-initialised: cells not copied below stay 0
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j + 1] = BOARD(i, j);
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 0, c)] = inEl[y][x];
+  }
+
+  // top-right
+  {
+    sycl::half inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j] = BOARD(i, j + 3);
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 1, c)] = inEl[y][x];
+  }
+
+  // bottom-left
+  {
+    sycl::half inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j + 1] = BOARD(i + 3, j);
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 2, c)] = inEl[y][x];
+  }
+
+  // bottom-right
+  {
+    sycl::half inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j] = BOARD(i + 3, j + 3);
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x];
+  }
+#endif
+}
+
+// Launches the fused "output transform -> optional SE -> activation -> input
+// transform" step on sycl_queue. The kernel variant is chosen from use_se and
+// C; throws Exception when use_se is set and
+// C > kMaxResBlockFusingSeKFp16Ampere.
+template <typename T = sycl::half, bool use_se, ActivationFunction activation,
+          bool use_bias, bool use_skip>
+void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
+                          const T* skip, const T* bias, const T* w1,
+                          const T* b1, const T* w2, const T* b2, sycl::queue &sycl_queue) {
+  // Each thread processes entire chess board.
+  if (use_se == false) {
+    sycl::range<3> grid_dim(1, N, DivUp(C, kOpInpTransformBlockSize));
+    {
+      sycl_queue.parallel_for(
+          sycl::nd_range<3>(
+              grid_dim * sycl::range<3>(1, 1, kOpInpTransformBlockSize),
+              sycl::range<3>(1, 1, kOpInpTransformBlockSize)),
+          [=](sycl::nd_item<3> item_ct1) {
+            OutputTransform_relu_InputTransform_kernel<sycl::half, activation,
+                                                       use_bias, use_skip>(
+                N, C, output, input, (sycl::half*)skip, bias, item_ct1);
+          });
+    }
+  } else if (C > kMaxResBlockFusingChannels) {
+    // Use special kernel with reduced register pressure - only works on Ampere,
+    // and only for fp16.
+    if (C <= kMaxResBlockFusingSeKFp16Ampere) {
+      // The CUDA original raised the kernel's dynamic shared memory limit at
+      // this point (cudaFuncSetAttribute with
+      // cudaFuncAttributeMaxDynamicSharedMemorySize set to
+      // 72 * C * sizeof(half)). No such call is made here; the same amount is
+      // requested through dpct_local_acc_ct1 below.
+      /*
+      DPCT1049:36: The work-group size passed to the SYCL kernel may exceed the
+      limit. To get the device limit, query info::device::max_work_group_size.
+      Adjust the work-group size if needed.
+      */
+      {
+        sycl_queue.submit([&](sycl::handler& cgh) {
+          /*
+          DPCT1083:124: The size of local memory in the migrated code may be
+          different from the original code. Check that the allocated memory
+          size in the migrated code is correct.
+          */
+          sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
+              sycl::range<1>(72 * C * sizeof(sycl::half)), cgh);
+          /*
+          One float per work-item. Sized with the constant that bounds C in
+          the check above so the two cannot drift apart; DPCT1101:125 had
+          inlined it as the literal 512.
+          */
+          sycl::local_accessor<float, 1> shared_data_acc_ct1(
+              sycl::range<1>(kMaxResBlockFusingSeKFp16Ampere), cgh);
+          /*
+          First dimension: one row per group of 32 work-items, up to the
+          largest C accepted above. DPCT1101:126 had inlined this expression
+          as the literal 16.
+          */
+          /*
+          DPCT1101:127: 'kMaxResBlockFusingSeK' expression was replaced with a
+          value. Modify the code to use the original expression, provided in
+          comments, if it is correct.
+          */
+          sycl::local_accessor<float, 2> shared_sums_acc_ct1(
+              sycl::range<2>(kMaxResBlockFusingSeKFp16Ampere / 32,
+                             128 /*kMaxResBlockFusingSeK*/),
+              cgh);
+
+          cgh.parallel_for(
+              sycl::nd_range<3>(
+                  sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                  sycl::range<3>(1, 1, C)),
+              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
+                  SYCL_SUB_GROUP_SIZE)]] {
+                OutputInputTransformKernel_fp16_shmem_board<activation,
+                                                            use_bias, use_skip>(
+                    N, C, se_K, (sycl::half*)output, (const sycl::half*)input,
+                    (sycl::half*)skip, (sycl::half*)bias, (sycl::half*)w1,
+                    (sycl::half*)b1, (sycl::half*)w2, (sycl::half*)b2, item_ct1,
+                    dpct_local_acc_ct1.get_pointer(),
+                    shared_data_acc_ct1.get_pointer(), shared_sums_acc_ct1);
+              });
+        });
+      }
+    } else {
+      throw Exception(
+          "res block fusing opt not supported for the given data type and no "
+          "of filters\n");
+    }
+  } else {
+    /*
+    DPCT1049:37: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    sycl_queue.submit([&](sycl::handler& cgh) {
+      /*
+      Sized with kMaxResBlockFusingChannels, the upper bound on C in this
+      branch, instead of the literal 384 that DPCT1101:128 had inlined, so
+      the buffer size follows the constant used by the branch condition.
+      */
+      sycl::local_accessor<float, 1> shared_data_acc_ct1(
+          sycl::range<1>(kMaxResBlockFusingChannels), cgh);
+      /*
+      First dimension restored to the original expression
+      kMaxResBlockFusingChannels / 32 (DPCT1101:129 had inlined it as the
+      literal 12).
+      */
+      /*
+      DPCT1101:130: 'kMaxResBlockFusingSeK' expression was replaced with a
+      value. Modify the code to use the original expression, provided in
+      comments, if it is correct.
+      */
+      sycl::local_accessor<float, 2> shared_sums_acc_ct1(
+          sycl::range<2>(kMaxResBlockFusingChannels / 32,
+                         128 /*kMaxResBlockFusingSeK*/),
+          cgh);
+
+      cgh.parallel_for(
+          sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
+                            sycl::range<3>(1, 1, C)),
+          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
+            OutputTransform_SE_relu_InputTransform_kernel<
+                sycl::half, activation, use_bias, use_skip>(
+                N, C, se_K, output, input, (sycl::half*)skip, bias, w1, b1, w2,
+                b2, item_ct1, shared_data_acc_ct1.get_pointer(),
+                shared_sums_acc_ct1);
+          });
+    });
+  }
+}
+
+template void FilterTransform<sycl::half>(int N, int C, sycl::half* transformedFilter,
+                                    const sycl::half* filter, sycl::queue &sycl_queue);
+
+template void InputTransform<sycl::half, true>(int N, int C, sycl::half* transformed_input,
+                                         const sycl::half* input, sycl::queue &sycl_queue);
+
+template void InputTransform<sycl::half, false>(int N, int C, sycl::half* transformed_input,
+                                          const sycl::half* input, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, true, ACTIVATION_RELU, true, true, false,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_RELU, true, true, false,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, true, ACTIVATION_RELU, true, true, true,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_RELU, true, true, true,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_RELU, true, false, false,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_RELU, true, false, false,
+                              true>(int N, int C, int se_K, sycl::half* output,
+                                    const sycl::half* input, const sycl::half* skip,
+                                    const sycl::half* bias, const sycl::half* w1,
+                                    const sycl::half* b1, const sycl::half* w2,
+                                    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, true, ACTIVATION_RELU, true, true, true,
+                              true>(int N, int C, int se_K, sycl::half* output,
+                                    const sycl::half* input, const sycl::half* skip,
+                                    const sycl::half* bias, const sycl::half* w1,
+                                    const sycl::half* b1, const sycl::half* w2,
+                                    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, true, ACTIVATION_MISH, true, true, false,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_MISH, true, true, false,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, true, ACTIVATION_MISH, true, true, true,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_MISH, true, true, true,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_MISH, true, false, false,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_MISH, true, false, false,
+                              true>(int N, int C, int se_K, sycl::half* output,
+                                    const sycl::half* input, const sycl::half* skip,
+                                    const sycl::half* bias, const sycl::half* w1,
+                                    const sycl::half* b1, const sycl::half* w2,
+                                    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, true, ACTIVATION_MISH, true, true, true,
+                              true>(int N, int C, int se_K, sycl::half* output,
+                                    const sycl::half* input, const sycl::half* skip,
+                                    const sycl::half* bias, const sycl::half* w1,
+                                    const sycl::half* b1, const sycl::half* w2,
+                                    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputTransform<sycl::half, false, ACTIVATION_NONE, true, false, false,
+                              false>(int N, int C, int se_K, sycl::half* output,
+                                     const sycl::half* input, const sycl::half* skip,
+                                     const sycl::half* bias, const sycl::half* w1,
+                                     const sycl::half* b1, const sycl::half* w2,
+                                     const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<sycl::half, true, ACTIVATION_RELU, true, true>(
+    int N, int C, int se_K, sycl::half* output, const sycl::half* input, const sycl::half* skip,
+    const sycl::half* bias, const sycl::half* w1, const sycl::half* b1, const sycl::half* w2,
+    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<sycl::half, false, ACTIVATION_RELU, true, true>(
+    int N, int C, int se_K, sycl::half* output, const sycl::half* input, const sycl::half* skip,
+    const sycl::half* bias, const sycl::half* w1, const sycl::half* b1, const sycl::half* w2,
+    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<sycl::half, false, ACTIVATION_RELU, true, false>(
+    int N, int C, int se_K, sycl::half* output, const sycl::half* input, const sycl::half* skip,
+    const sycl::half* bias, const sycl::half* w1, const sycl::half* b1, const sycl::half* w2,
+    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<sycl::half, true, ACTIVATION_MISH, true, true>(
+    int N, int C, int se_K, sycl::half* output, const sycl::half* input, const sycl::half* skip,
+    const sycl::half* bias, const sycl::half* w1, const sycl::half* b1, const sycl::half* w2,
+    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<sycl::half, false, ACTIVATION_MISH, true, true>(
+    int N, int C, int se_K, sycl::half* output, const sycl::half* input, const sycl::half* skip,
+    const sycl::half* bias, const sycl::half* w1, const sycl::half* b1, const sycl::half* w2,
+    const sycl::half* b2, sycl::queue &sycl_queue);
+
+template void OutputInputTransform<sycl::half, false, ACTIVATION_MISH, true, false>(
+    int N, int C, int se_K, sycl::half* output, const sycl::half* input, const sycl::half* skip,
+    const sycl::half* bias, const sycl::half* w1, const sycl::half* b1, const sycl::half* w2,
+    const sycl::half* b2, sycl::queue &sycl_queue);
+
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/inputs_outputs.h b/src/neural/backends/sycl/inputs_outputs.h
new file mode 100644
index 0000000000..3d9cf38705
--- /dev/null
+++ b/src/neural/backends/sycl/inputs_outputs.h
@@ -0,0 +1,116 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier:GNU General Public License v3.0 or later
+*/
+
+#include <sycl/sycl.hpp>
+#include "neural/network.h"
+#include "cuBlasContext.h"
+
+namespace lczero {
+namespace sycldnn_backend {
+
+// Buffers for one evaluation batch: host-visible input/output staging memory
+// and, when tensor_mem_size is non-zero, the device memory used to run the
+// network. Every buffer is a USM allocation made on the queue passed in.
+struct InputsOutputs {
+  // maxBatchSize scales every buffer; wdl selects 3 value outputs per position
+  // instead of 1; moves_left adds the moves-left buffer. m_ct1 is stored by
+  // reference. cublasDisableTensorCores is accepted but unused here.
+  InputsOutputs(int maxBatchSize, bool wdl, bool moves_left, sycl::queue& m_ct1,
+                size_t tensor_mem_size = 0, size_t scratch_size = 0,
+                bool cublasDisableTensorCores = false): q_ct1(m_ct1) {
+  #ifdef USE_CUBLAS
+    cublasHandle_t h= cuBlasContextManager::getcuBlasHandle_t();
+  #endif
+    input_masks_mem_shared_ = malloc_host<uint64_t>(maxBatchSize * kInputPlanes, q_ct1);
+    input_val_mem_shared_ = malloc_host<float>(maxBatchSize * kInputPlanes, q_ct1);
+    // Separate device memory copy for policy output.
+    // It's faster to write to device memory and then copy to host memory
+    // than having the kernel write directly to it.
+    op_policy_mem_ = malloc_host<float>(maxBatchSize * kNumOutputPolicy, q_ct1);
+    op_policy_mem_gpu_ = malloc_device<float>(maxBatchSize * kNumOutputPolicy, q_ct1);
+    op_value_mem_shared_ = malloc_host<float>(maxBatchSize * (wdl ? 3 : 1), q_ct1);
+
+    if (moves_left) {
+      op_moves_left_mem_shared_ = malloc_host<float>(maxBatchSize, q_ct1);
+    }
+
+    // Memory for network execution is owned by this structure only when a
+    // tensor size is given; otherwise tensor_mem_/scratch_mem_ stay null.
+    multi_stream_ = tensor_mem_size != 0;
+    if (multi_stream_) {
+      scratch_mem_ = (void*)sycl::malloc_device( scratch_size, q_ct1);
+      for (auto& mem : tensor_mem_) {
+        mem = (void*)sycl::malloc_device(tensor_mem_size, q_ct1);
+        q_ct1.memset(mem, 0, tensor_mem_size);
+      }
+    }
+  }
+
+  // NOTE(review): cleanup is disabled, so every constructor allocation leaks
+  // on destruction; the block kept below also never frees op_policy_mem_.
+  // TODO: confirm why it was switched off before re-enabling it.
+  ~InputsOutputs() {
+    /*
+    sycl::free(input_masks_mem_shared_, q_ct1);
+    sycl::free(input_val_mem_shared_, q_ct1);
+    sycl::free(op_value_mem_shared_, q_ct1);
+    if (op_moves_left_mem_shared_ != nullptr)
+        sycl::free(op_moves_left_mem_shared_, q_ct1);
+    sycl::free(op_policy_mem_gpu_, q_ct1);
+    if (multi_stream_) {
+      for (auto mem : tensor_mem_) {
+        if (mem)
+            sycl::free(mem, q_ct1);
+      }
+      if (scratch_mem_)
+          sycl::free(scratch_mem_, q_ct1);
+      if (offset_pointers_)
+          sycl::free(offset_pointers_, q_ct1);
+      if (head_offset_pointers_) {
+          sycl::free(head_offset_pointers_, q_ct1);
+      }
+    } */
+  }
+
+  // Host staging buffers, allocated with malloc_host.
+  uint64_t* input_masks_mem_shared_;
+  float* input_val_mem_shared_;
+  float* op_value_mem_shared_;
+  float* op_moves_left_mem_shared_ = nullptr;
+
+  // Policy output: device buffer plus its separate host copy.
+  float* op_policy_mem_gpu_;
+  float* op_policy_mem_;
+
+  // Memory needed to run the network, owned by InputsOutputs when
+  // multi_stream_ is set. Null-initialized so the unowned case is well defined.
+  bool multi_stream_;
+  void* tensor_mem_[3] = {};
+  void* scratch_mem_ = nullptr;
+  void** offset_pointers_ = nullptr;
+  void** head_offset_pointers_ = nullptr;
+
+  // Queue used to run the network (held by reference, not owned).
+  sycl::queue& q_ct1;
+};
+
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/kernels.h b/src/neural/backends/sycl/kernels.h
new file mode 100644
index 0000000000..2330cae9f5
--- /dev/null
+++ b/src/neural/backends/sycl/kernels.h
@@ -0,0 +1,153 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier:GNU General Public License v3.0 or later
+*/
+
+#include <sycl/sycl.hpp>
+#include "sycl_common.h"
+#include "neural/backends/shared/activation.h"
+
+namespace lczero {
+namespace sycldnn_backend {
+
+// Adds two vectors (possibly of different sizes), also do optional
+// activation (relu, tanh or sigmoid).
+template <typename T>
+void addVectors(T* c, T* a, T* b, int size, int asize, int bsize,
+                ActivationFunction activation, sycl::queue &sycl_queue);
+
+// Adds two vectors of equal size overwriting the first with the sum.
+// This specialisation performs a transposition of the first 2 indexes
+// of the second while performing the addition.
+template <typename T>
+void addVectorsHNC_NHC(T* a, T* b, int N, int H, int C, sycl::queue &sycl_queue);
+
+// Optimized kernel to add bias to innermost dimension
+// and perform optional activation (to be used with GEMMs/fully connected)
+template <typename T>
+void addBiasBatched(T* output, const T* input, const T* bias, int Batch, int N,
+                    int C, ActivationFunction activation, sycl::queue &sycl_queue);
+
+// Overload of addBiasBatched above that additionally takes an Nstride
+// argument; also performs optional activation.
+template <typename T>
+void addBiasBatched(T* output, const T* input, const T* bias, int Batch, int N,
+                    int C, int Nstride, ActivationFunction activation, sycl::queue &sycl_queue);
+
+// Add bias to convolution's output.
+template <typename T>
+void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W,
+                  ActivationFunction activation, sycl::queue &sycl_queue);
+
+// Conversion from NCHW to NHWC, can also change datatype depending on template
+// params, also pad/un-pad elements from Batch or Channel dimensions
+template <typename DstType, typename SrcType>
+void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor,
+                       int Nin, int Cin, int Nout, int Cout, int H, int W, sycl::queue &sycl_queue);
+
+// Plain data-type conversion (no layout conversion).
+template <typename DstType, typename SrcType>
+void copyTypeConverted(DstType* op, SrcType* ip, int N, sycl::queue &sycl_queue);
+
+// Perform batch normalization.
+template <typename T>
+void batchNorm(T* output, const T* input, const T* skipInput, int N, int C,
+               int H, int W, float* means, float* var_multipliers,
+               ActivationFunction activation, sycl::queue &sycl_queue);
+
+// Unpack planes (input to network).
+void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks,
+                            const float* values, int n, sycl::queue &sycl_queue);
+
+void expandPlanes_Fp16_NHWC(sycl::half* output, const uint64_t* masks,
+                            const float* values, int n, sycl::queue &sycl_queue);
+
+void expandPlanes_Fp16_NCHW(sycl::half* output, const uint64_t* masks,
+                            const float* values, int n, sycl::queue &sycl_queue);
+
+// Perform global avg pool.
+template <typename T>
+void globalAvgPool(int N, int C, T* output, const T* input,
+                   const T* prevLayerBias, bool nhwc, sycl::queue &sycl_queue);
+
+// Perform global scale.
+template <typename T>
+void globalScale(int N, int C, T* output, const T* input, const T* scaleBias,
+                 const T* prevLayerBias, bool nhwc,
+                 ActivationFunction activation, sycl::queue &sycl_queue);
+
+// Perform Squeeze-and-Excitation (SE) in a single fused kernel.
+// Returns false if the fused kernel can't handle the sizes.
+bool Se_Fp16_NHWC(int N, int C, int numFc1Out, sycl::half* output,
+                  const sycl::half* skip, const sycl::half* input,
+                  const sycl::half* w1, const sycl::half* b1,
+                  const sycl::half* w2, const sycl::half* b2,
+                  const sycl::half* bPrev, ActivationFunction activation, sycl::queue &sycl_queue);
+
+template <typename T>
+void PolicyMap(int N, T* output, const T* input, const short* indices,
+               int inputSize, int usedSize, int outputSize, sycl::queue &sycl_queue);
+
+// Custom winograd helper functions
+template <typename T>
+void FilterTransform(int N, int C, T* transformedFilter, const T* filter, sycl::queue &sycl_queue);
+
+template <typename T, bool nhcw>
+void InputTransform(int N, int C, T* transformedInput, const T* input, sycl::queue &sycl_queue);
+
+template <typename T, bool use_se, ActivationFunction activation, bool use_bias,
+          bool use_skip, bool skipInput_nhcw, bool output_nhcw>
+void OutputTransform(int N, int C, int se_K, T* output, const T* input,
+                     const T* skip, const T* bias, const T* w1, const T* b1,
+                     const T* w2, const T* b2, sycl::queue &sycl_queue);
+
+template <typename T, bool use_se, ActivationFunction activation, bool use_bias,
+          bool use_skip>
+void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
+                          const T* skip, const T* bias, const T* w1,
+                          const T* b1, const T* w2, const T* b2, sycl::queue &sycl_queue);
+
+template <typename T>
+void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::queue &sycl_queue);
+
+template <typename T>
+void LayerNorm(int N, int C, T* output, const T* input, const T* bias,
+               const T* skip, const T* gammas, const T* betas, float ep,
+               float alpha, ActivationFunction act, sycl::queue &sycl_queue);
+
+template <typename T>
+void ComputePromotionLogits(int N, int C, T* output, const T* keys,
+                            const T* ppo, const T* policy_attn_logits, sycl::queue &sycl_queue);
+
+template <typename T>
+void inputPreprocessForAttentionBody(T* output, const T* input,
+                                     const T* encoding, int N, int input_size,
+                                     int encoding_size,
+                                     bool is_pe_dense_embedding,
+                                     sycl::queue &sycl_queue);
+
+template <typename T>
+void applyInputGating(T* output, const T* input, const T* mult, const T* add,
+                      int N, int HW, int C, sycl::queue &sycl_queue);
+
+template <typename T>
+void genOffsetPointers(T** offsets, int heads, int max_batch, int depth,
+                       int d_model, T* k, T* q, T* b1, T* v, T* b2, sycl::queue &sycl_queue);
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/layers.cc.dp.cpp b/src/neural/backends/sycl/layers.cc.dp.cpp
new file mode 100644
index 0000000000..8a046ab292
--- /dev/null
+++ b/src/neural/backends/sycl/layers.cc.dp.cpp
@@ -0,0 +1,2830 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier:GNU General Public License v3.0 or later
+*/
+
+#include <sycl/sycl.hpp>
+#include "layers.h"
+
+#include <cassert>
+#include <cstring>
+#include <vector>
+
+#ifdef USE_HIPBLAS 
+#include "hipblas/hipblas.h"
+#include "cuBlasContext.h"
+#elif defined(USE_CUBLAS)
+#include <sycl/backend/cuda.hpp>
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "cuBlasContext.h"
+#else
+#include "oneapi/mkl.hpp"
+#include "oneapi/mkl/blas.hpp"
+#endif
+
+#include "sycl_common.h"
+#include "kernels.h"
+#include "neural/network.h"
+#include "neural/tables/attention_policy_map.h"
+#include "utils/fp16_utils.h"
+
+#include <cmath>
+
+// Backend-neutral names for the BLAS transpose type and its two values.
+#ifdef USE_HIPBLAS
+#if hipblasVersionMajor < 3  // alias the compute-type names to HIPBLAS_R_*
+#define HIPBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define HIPBLAS_COMPUTE_32F HIPBLAS_R_32F
+#endif
+#define transpose_type hipblasOperation_t 
+#define transpose_type_transpose HIPBLAS_OP_T  
+#define transpose_type_notranspose HIPBLAS_OP_N 
+#elif defined(USE_CUBLAS)
+#define transpose_type cublasOperation_t 
+#define transpose_type_transpose CUBLAS_OP_T  
+#define transpose_type_notranspose CUBLAS_OP_N 
+#else
+#define transpose_type oneapi::mkl::transpose 
+#define transpose_type_transpose oneapi::mkl::transpose::trans
+#define transpose_type_notranspose oneapi::mkl::transpose::nontrans
+#endif
+
+
+namespace lczero {
+namespace sycldnn_backend {
+
+// Use Single kernel for entire SE operation.
+// Right now supported only for fp16 with nhwc and it's quite a bit faster
+// than using multiple passes. The flag can be set to false for debugging.
+static constexpr bool kUseFusedSELayer = true;
+
+// Builds a layer of dimensions c x h x w whose input is `ip`, with an
+// explicitly chosen nhwc layout flag.
+template <typename DataType>
+BaseLayer<DataType>::BaseLayer(int c, int h, int w, BaseLayer* ip, bool nhwc,
+                               sycl::queue& sycl_queue)
+    : input_(ip), C(c), H(h), W(w), nhwc_(nhwc), sycl_queue_(sycl_queue) {}
+
+// Same, except that the layout flag is inherited from `ip`; it is false when
+// there is no input layer.
+template <typename DataType>
+BaseLayer<DataType>::BaseLayer(int c, int h, int w, BaseLayer* ip,
+                               sycl::queue& sycl_queue)
+    : BaseLayer(c, h, w, ip, ip ? ip->nhwc_ : false, sycl_queue) {}
+
+// Allocates (but does not fill) the device buffers for the SE weights/biases.
+template <typename DataType>
+SELayer<DataType>::SELayer(BaseLayer<DataType>* ip, int fc1Outputs,
+                           bool addPrevLayerBias, ActivationFunction activation,
+                           sycl::queue& sycl_queue)
+    : BaseLayer<DataType>(ip->GetC(), ip->GetH(), ip->GetW(), ip, sycl_queue),
+      numFc1Out_(fc1Outputs),
+      addPrevLayerBias_(addPrevLayerBias),
+      act_(activation) {
+  const auto device_alloc = [this](size_t elems) {
+    return (DataType*)sycl::malloc_device(elems * sizeof(DataType), sycl_queue_);
+  };
+  const size_t fc1_weights = C * numFc1Out_;
+  const size_t fc2_weights = 2 * C * numFc1Out_;
+  w1_ = device_alloc(fc1_weights);
+  w2_ = device_alloc(fc2_weights);
+  if (kUseFusedSELayer && nhwc_) {
+    // Second copies of both weight buffers, allocated only on this path.
+    w1_t_ = device_alloc(fc1_weights);
+    w2_t_ = device_alloc(fc2_weights);
+  }
+  b1_ = device_alloc(numFc1Out_);
+  b2_ = device_alloc(2 * C);
+  bPrev_ = device_alloc(C);
+}
+
+template <typename DataType>
+SELayer<DataType>::~SELayer() {
+  // Free everything the constructor allocated. The transposed weight copies
+  // exist only when kUseFusedSELayer && nhwc_ and were previously leaked.
+  for (DataType* buf : {w1_, w2_, b1_, b2_, bPrev_}) sycl::free(buf, sycl_queue_);
+  if (kUseFusedSELayer && nhwc_)
+    for (DataType* buf : {w1_t_, w2_t_}) sycl::free(buf, sycl_queue_);
+}
+
+// Uploads the SE weights and biases to their device buffers. The data is
+// already fp32, so it is copied as-is and the scratch buffer is not needed.
+// Returns only after all copies have completed.
+template <>
+void SELayer<float>::LoadWeights(float* w1, float* b1, float* w2, float* b2,
+                                 float* prevLayerBias, void* /*scratch*/) {
+  const size_t fc1_elems = C * numFc1Out_;
+  const size_t fc1_bytes = sizeof(float) * fc1_elems;
+  // The second FC layer has twice as many weights as the first.
+  const size_t fc2_bytes = 2 * fc1_bytes;
+
+  // Weights of the first, then the second FC layer.
+  sycl_queue_.memcpy(w1_, w1, fc1_bytes);
+  sycl_queue_.memcpy(w2_, w2, fc2_bytes);
+
+  // Biases of the first, then the second FC layer.
+  sycl_queue_.memcpy(b1_, b1, numFc1Out_ * sizeof(float));
+  sycl_queue_.memcpy(b2_, b2, 2 * C * sizeof(float));
+
+  // The bias of the previous (convolution) layer is optional.
+  if (prevLayerBias != nullptr) {
+    sycl_queue_.memcpy(bPrev_, prevLayerBias, C * sizeof(float));
+  }
+
+  // The copies above were only enqueued; block until the queue has drained.
+  sycl_queue_.wait();
+}
+
+void cpuTranspose(float* op, float* ip, int rows, int cols) {
+  for (int r = 0; r < rows; ++r)  // row-major ip(r, c) -> op(c, r)
+    for (int c = 0; c < cols; ++c, ++ip) op[c * rows + r] = *ip;
+}
+
+// Uploads the SE weights and biases, converting them from fp32 to fp16.
+//
+// Every host array is first copied into `scratch` and then converted on the
+// device into its destination buffer. When the fused SE path is active
+// (kUseFusedSELayer && nhwc_), transposed copies of both weight matrices are
+// uploaded as well.
+// NOTE(review): assumes `scratch` is device-accessible memory large enough
+// for 2 * C * numFc1Out_ floats — confirm against the caller.
+template <>
+void SELayer<sycl::half>::LoadWeights(float* w1, float* b1, float* w2, float* b2,
+                                float* prevLayerBias, void* scratch) {
+  const size_t num_weights1 = C * numFc1Out_;
+  const size_t num_weights2 = 2 * num_weights1;
+
+  // Host staging area for the transposed weights. It holds floats, so it is
+  // sized by element count; it used to be sized by the byte count, which
+  // allocated four times more memory than needed.
+  std::vector<float> temp(num_weights2);
+
+  // Copies `count` floats from host memory into scratch (waiting for the copy
+  // to finish), then enqueues the conversion to fp16 into `dst`.
+  // NOTE(review): the next upload overwrites scratch while the previous
+  // conversion may still be queued; this presumably relies on the queue
+  // executing in order — confirm.
+  const auto upload = [&](sycl::half* dst, const float* src, size_t count) {
+    sycl_queue_.memcpy(scratch, src, count * sizeof(float)).wait();
+    copyTypeConverted(dst, (float*)scratch, (int)count, sycl_queue_);
+  };
+
+  // Weight for the first FC layer.
+  upload(w1_, w1, num_weights1);
+  if (kUseFusedSELayer && nhwc_) {
+    // Transposed copy for the fused SE kernel.
+    cpuTranspose(temp.data(), w1, numFc1Out_, C);
+    upload(w1_t_, temp.data(), num_weights1);
+  }
+
+  // Weight for the second FC layer.
+  upload(w2_, w2, num_weights2);
+  if (kUseFusedSELayer && nhwc_) {
+    // Transposed copy for the fused SE kernel.
+    cpuTranspose(temp.data(), w2, 2 * C, numFc1Out_);
+    upload(w2_t_, temp.data(), num_weights2);
+  }
+
+  // Bias for the first FC layer.
+  upload(b1_, b1, numFc1Out_);
+
+  // Bias for the second FC layer.
+  upload(b2_, b2, 2 * C);
+
+  // Bias for previous layer (Convolution); optional.
+  if (prevLayerBias) {
+    upload(bPrev_, prevLayerBias, C);
+  }
+
+}
+
+template <>
+void SELayer<float>::Eval(int N, float* output, const float* input,
+                          const float* /*input2*/, void* scratch,
+                          size_t scratch_size, sycl::queue &sycl_queue, float***) {
+  // fp32 SE path: global average pool -> FC1 (+bias, act_) -> FC2 (+bias) ->
+  // global scale into `output`. All work is submitted to `sycl_queue`.
+  // Ping-pong between 'op1' and 'op2' (parts of scratch memory).
+  float* op1 = (float*)scratch;
+  float* op2 = (float*)scratch + scratch_size / sizeof(float) / 2;
+
+  // 1. Global avg pooling (also adds previous layer bias before computing
+  // averages).
+  globalAvgPool(N, C, op2, input, bPrev_, false, sycl_queue);
+
+  // 2. First fully connected layer: op1 = w1_^T * op2 (column-major GEMM).
+  float alpha = 1.0f, beta = 0.0f;
+
+  #ifdef USE_CUBLAS
+  cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+
+  sycl_queue.submit([&](sycl::handler &cgh) {
+        // Run the GEMM as a native command on the queue's CUDA stream.
+        
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        // [=] copies alpha/beta into the lambda; &alpha/&beta refer to those copies.
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);  
+
+        ReportCUBLASErrors(cublasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_,
+                                 N, C, &alpha, w1_, C, op2, C, &beta, op1,
+                                 numFc1Out_));
+
+        });
+  });
+  #elif defined(USE_HIPBLAS)
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+
+  sycl_queue.submit([&](sycl::handler &cgh) {
+        // Run the GEMM as a native command on the queue's HIP stream.
+        
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+
+        hipblasSetStream(handle, hipStreamHandle);  
+        // NOTE(review): unlike the cuBLAS branch, the returned status is not checked.
+        hipblasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_,
+                                 N, C, &alpha, w1_, C, op2, C, &beta, op1,
+                                 numFc1Out_);
+        });
+  });  
+  #else
+  
+  oneapi::mkl::blas::column_major::gemm(sycl_queue, transpose_type_transpose,
+        transpose_type_notranspose, numFc1Out_, N, C, alpha, w1_, C, op2,
+        C, beta, op1, numFc1Out_);
+
+  #endif
+  // Add the FC1 bias b1_ to op1 in place and apply act_.
+  addVectors(op1, b1_, op1, numFc1Out_ * N, numFc1Out_, numFc1Out_ * N, act_, sycl_queue);
+
+  // 3. Second fully connected layer: op2 = w2_^T * op1, reusing `handle`.
+
+  #ifdef USE_CUBLAS
+  sycl_queue.submit([&](sycl::handler &cgh) {
+        
+        
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);  
+
+        ReportCUBLASErrors(cublasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, N,
+                                 numFc1Out_, &alpha, w2_, numFc1Out_, op1,
+                                 numFc1Out_, &beta, op2, 2 * C));
+
+        });
+  });
+
+  #elif defined(USE_HIPBLAS)
+  sycl_queue.submit([&](sycl::handler &cgh) {
+        
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+
+        hipblasSetStream(handle, hipStreamHandle);  
+
+        hipblasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, N,
+                                 numFc1Out_, &alpha, w2_, numFc1Out_, op1,
+                                 numFc1Out_, &beta, op2, 2 * C);
+
+        
+        });
+  });
+  #else
+    oneapi::mkl::blas::column_major::gemm(sycl_queue, transpose_type_transpose,
+        transpose_type_notranspose, 2 * C, N, numFc1Out_, alpha, w2_,
+        numFc1Out_, op1, numFc1Out_, beta, op2, 2 * C);
+  #endif
+  // Add the FC2 bias b2_ to op2 in place; no activation at this stage.
+  addVectors(op2, b2_, op2, 2 * C * N, 2 * C, 2 * C * N, ACTIVATION_NONE, sycl_queue);
+
+  // 4. (Optional prev layer bias add), Global scale, residual add, relu and
+  // bias.
+  globalScale(N, C, output, input, op2, bPrev_, false, act_, sycl_queue);
+
+}
+
+template <>
+void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* input,
+                         const sycl::half* input2, void* scratch, size_t scratch_size, sycl::queue &sycl_queue, sycl::half***) {
+  // fp16 SE evaluation. Se_Fp16_NHWC is tried when kUseFusedSELayer && nhwc_;
+  // if it was skipped or returned false, the step-by-step path below runs.
+  bool se_done = false;
+  if (kUseFusedSELayer && nhwc_) {
+    se_done = Se_Fp16_NHWC(N, C, numFc1Out_, output, input2, input, w1_t_, b1_,
+                           w2_t_, b2_, bPrev_, act_, sycl_queue);
+  }
+  if (!se_done) {
+    assert(output == input2);
+    // Ping-pong between 'op1' and 'op2' (parts of scratch memory).
+    sycl::half* op1 = (sycl::half*)scratch;
+    sycl::half* op2 = (sycl::half*)scratch + scratch_size / sizeof(sycl::half) / 2;
+
+    // 1. Global avg pooling (also adds previous layer bias before computing
+    // averages).
+    globalAvgPool(N, C, op2, input, bPrev_, nhwc_, sycl_queue);
+
+    // 2. First fully connected layer.
+    // GEMM scalars alpha = 1 and beta = 0 in the scalar type of each backend.
+    // NOTE(review): if hipblasHalf is an integer typedef, {1} is the bit
+    // pattern 0x0001 rather than half 1.0 (0x3C00 for cuBLAS) - confirm.
+    #ifdef USE_CUBLAS
+    __half_raw one_h{0x3C00};
+    __half_raw zero_h{0};
+    half alpha = one_h;
+    half beta = zero_h;
+
+    #elif defined(USE_HIPBLAS)
+    hipblasHalf alpha{1};
+    hipblasHalf beta{0};
+
+    #else
+    sycl::half alpha = 1;
+    sycl::half beta = 0;
+    #endif
+
+    #ifdef USE_CUBLAS
+    // op1 = w1_^T * op2 (column-major GEMM); addVectors adds the bias below.
+    cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+
+    sycl_queue.submit([&](sycl::handler &cgh) {
+
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);
+
+        ReportCUBLASErrors(cublasHgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_,
+                                   N, C, &alpha, ((const half *)w1_), C, ((const half *)op2), C, &beta, ((half *)op1),
+                                   numFc1Out_));
+
+        });
+    });
+
+#elif defined(USE_HIPBLAS)
+    hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+
+    sycl_queue.submit([&](sycl::handler &cgh) {
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+
+        hipblasSetStream(handle, hipStreamHandle);
+
+        hipblasHgemm(handle, transpose_type_transpose,
+                     transpose_type_notranspose,numFc1Out_, N, C, &alpha,
+                     ((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C,
+                     &beta, ((hipblasHalf *)op1), numFc1Out_);
+
+      });
+    });
+#else
+    oneapi::mkl::blas::column_major::gemm(
+        sycl_queue, transpose_type_transpose, transpose_type_notranspose,
+        numFc1Out_, N, C, alpha, ((const sycl::half *)w1_), C,
+        ((const sycl::half *)op2),C, beta, ((sycl::half *)op1), numFc1Out_);
+#endif
+
+    addVectors(op1, b1_, op1, numFc1Out_ * N, numFc1Out_, numFc1Out_ * N, act_, sycl_queue);
+
+    #ifdef USE_CUBLAS
+    // Use the caller's queue, the one every other step here is submitted to.
+    sycl_queue.submit([&](sycl::handler &cgh) {
+
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);
+
+        // 3. Second fully connected layer.
+        ReportCUBLASErrors(cublasHgemm(handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, N,
+                                   numFc1Out_, &alpha, ((const half *)w2_), numFc1Out_, ((const half *)op1),
+                                   numFc1Out_, &beta, ((half *)op2), 2 * C));
+
+        });
+    });
+
+#elif defined(USE_HIPBLAS)
+    sycl_queue.submit([&](sycl::handler &cgh) {
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+        hipblasSetStream(handle, hipStreamHandle);
+
+        hipblasHgemm(
+            handle, transpose_type_transpose, transpose_type_notranspose, 2 * C,
+            N, numFc1Out_, &alpha,((const hipblasHalf *)w2_), numFc1Out_,
+            ((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2),
+            2 * C);
+
+      });
+    });
+#else
+    oneapi::mkl::blas::column_major::gemm(
+        sycl_queue, transpose_type_transpose, transpose_type_notranspose, 2 * C,
+        N, numFc1Out_, alpha, ((const sycl::half *)w2_), numFc1Out_,
+        ((const sycl::half *)op1), numFc1Out_, beta, ((sycl::half *)op2),
+        2 * C);
+#endif
+
+    addVectors(op2, b2_, op2, 2 * C * N, 2 * C, 2 * C * N, ACTIVATION_NONE, sycl_queue);
+
+    // 4. (Optional prev layer bias add), Global scale, residual add, relu and
+    // bias.
+    globalScale(N, C, output, input, op2, bPrev_, nhwc_, act_, sycl_queue);
+  }
+}
+
+template <typename DataType>
+FCLayer<DataType>::FCLayer(BaseLayer<DataType>* ip, int C, int H, int W,
+                           bool bias, ActivationFunction activation, sycl::queue &sycl_queue)
+    : BaseLayer<DataType>(C, H, W, ip, sycl_queue), use_bias_(bias), act_(activation)  {
+  // One bias per output element (C * H * W of them) and one weight per
+  // (output element, input element) pair.
+  const size_t bias_bytes = sizeof(DataType) * C * H * W;
+  const size_t weight_bytes =
+      sizeof(DataType) * C * H * W * ip->GetC() * ip->GetH() * ip->GetW();
+
+  weights_ = static_cast<DataType*>(sycl::malloc_device(weight_bytes, sycl_queue_));
+
+  // The bias buffer only exists when the layer was built with a bias.
+  biases_ = use_bias_
+      ? static_cast<DataType*>(sycl::malloc_device(bias_bytes, sycl_queue_)) : nullptr;
+}
+
+template <>
+void FCLayer<sycl::half>::LoadWeights(float* cpuWeight, float* cpuBias,
+                                void* scratch) {
+  // fp32 host data is staged in `scratch` and converted to fp16 by a kernel,
+  // so a staging buffer is mandatory.
+  assert(scratch);
+  float* const staged = (float*)scratch;
+
+  const size_t bias_count = C * H * W;
+  const size_t weight_count =
+      C * H * W * input_->GetC() * input_->GetH() * input_->GetW();
+
+  sycl_queue_.memcpy(scratch, cpuWeight, sizeof(float) * weight_count).wait();
+  if (!nhwc_) {
+    copyTypeConverted((sycl::half*)weights_, staged, (int)weight_count, sycl_queue_);
+  } else {
+    // NHWC layers go through convertNCHWtoNHWC instead of a plain conversion.
+    convertNCHWtoNHWC((sycl::half*)weights_, staged, (int)bias_count,
+                      input_->GetC(), (int)bias_count, input_->GetC(),
+                      input_->GetH(), input_->GetW(), sycl_queue_);
+  }
+
+  // Biases are optional; a null host pointer means there is nothing to load.
+  if (!cpuBias) return;
+  sycl_queue_.memcpy(scratch, cpuBias, sizeof(float) * bias_count).wait();
+  copyTypeConverted((sycl::half*)biases_, staged, (int)bias_count, sycl_queue_);
+}
+
+template <>
+void FCLayer<float>::LoadWeights(float* cpuWeight, float* cpuBias,
+                                 void* /*scratch*/) {
+  const size_t num_weights =
+      C * H * W * input_->GetC() * input_->GetH() * input_->GetW();
+  const size_t weight_size = sizeof(float) * num_weights;
+  const size_t num_biases = C * H * W;
+  const size_t bias_size = sizeof(float) * num_biases;
+
+  // fp32 needs no conversion, so copy straight from host into device memory.
+  // Each copy is waited on (as the fp16 overload does) so the host buffers
+  // are no longer read once this function returns.
+  sycl_queue_.memcpy(weights_, cpuWeight, weight_size).wait();
+
+  if (use_bias_) {
+    sycl_queue_.memcpy(biases_, cpuBias, bias_size).wait();
+  }
+}
+
+template <>
+ void FCLayer<sycl::half>::Eval(int N, sycl::half* output_tensor, const sycl::half* input_tensor,
+                          const sycl::half* /*input2*/, void* /*scratch*/,
+                          size_t /*scratch_size*/, sycl::queue &sycl_queue, sycl::half***) {
+   // fp16 fully connected layer for a batch of N samples: one GEMM of weights_
+   // with input_tensor into output_tensor on the backend selected at compile
+   // time, followed by addVectors when a bias or an activation is configured.
+   const int num_outputs = C * H * W;
+   const int num_inputs = input_->GetC() * input_->GetH() * input_->GetW();
+
+   // GEMM scalars alpha = 1 and beta = 0 in the scalar type of each backend.
+   // NOTE(review): if hipblasHalf is an integer typedef, {1} is the bit pattern
+   // 0x0001 rather than half 1.0 (0x3C00, as used for cuBLAS) - confirm.
+   #ifdef USE_CUBLAS
+    __half_raw one_h{0x3C00};
+    __half_raw zero_h{0};
+    half alpha = one_h;
+    half beta = zero_h;
+
+    #elif defined(USE_HIPBLAS)
+    hipblasHalf alpha{1};
+    hipblasHalf beta{0};
+
+    #else
+    sycl::half alpha = 1;
+    sycl::half beta = 0;
+    #endif
+   // Column-major GEMM: output (num_outputs x N) = weights_^T * input (num_inputs x N).
+   #ifdef USE_CUBLAS
+    cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+
+    sycl_queue.submit([&](sycl::handler &cgh) {
+         // The native command issues the cuBLAS call on the queue's CUDA stream.
+         cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+
+         auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+         cublasSetStream(handle, cudaStreamHandle);
+
+         ReportCUBLASErrors(cublasHgemm(handle, transpose_type_transpose, transpose_type_notranspose, num_outputs,
+                                  N, num_inputs, &alpha, ((const half *)weights_), num_inputs,
+                                  ((const half *)input_tensor), num_inputs, &beta, ((half *)output_tensor),
+                                  num_outputs));
+
+       });
+   });
+#elif defined(USE_HIPBLAS)
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  sycl_queue.submit([&](sycl::handler &cgh) {
+    cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+      auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+      hipblasSetStream(handle, hipStreamHandle);
+
+      hipblasHgemm(
+          handle, transpose_type_transpose, transpose_type_notranspose,
+          num_outputs, N, num_inputs, &alpha, ((const hipblasHalf *)weights_),
+          num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta,
+          ((hipblasHalf *)output_tensor), num_outputs);
+
+      });
+  });
+#else
+  oneapi::mkl::blas::column_major::gemm(
+      sycl_queue, transpose_type_transpose, transpose_type_notranspose,
+      num_outputs, N, num_inputs, alpha, ((const sycl::half *)weights_),
+      num_inputs, ((const sycl::half *)input_tensor), num_inputs, beta,
+      ((sycl::half *)output_tensor), num_outputs);
+#endif
+   // addVectors runs in place on output_tensor with biases_ and act_.
+   if (use_bias_ || (act_ != ACTIVATION_NONE)) {
+     addVectors(output_tensor, biases_, output_tensor, num_outputs * N,
+                num_outputs, num_outputs * N, act_, sycl_queue);
+   }
+ }
+
+template <>
+void FCLayer<float>::Eval(int N, float* output_tensor,
+                          const float* input_tensor, const float* /*input2*/,
+                          void* /*scratch*/, size_t /*scratch_size*/, sycl::queue &sycl_queue, float***) {
+  // fp32 fully connected layer for a batch of N samples: one GEMM of weights_
+  // with input_tensor into output_tensor on the backend selected at compile
+  // time, followed by addVectors when a bias or an activation is configured.
+  const int num_outputs = C * H * W;
+  const int num_inputs = input_->GetC() * input_->GetH() * input_->GetW();
+
+  float alpha = 1.0f, beta = 0.0f;
+  // Column-major GEMM: output (num_outputs x N) = weights_^T * input (num_inputs x N).
+
+  #ifdef USE_CUBLAS
+  cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+
+  sycl_queue.submit([&](sycl::handler &cgh) {
+        // The native command issues the cuBLAS call on the queue's CUDA stream.
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);
+
+
+        ReportCUBLASErrors(cublasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, num_outputs,
+                                 N, num_inputs, &alpha, weights_, num_inputs,
+                                 input_tensor, num_inputs, &beta, output_tensor,
+                                 num_outputs));
+
+      });
+  });
+  #elif defined(USE_HIPBLAS)
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  sycl_queue.submit([&](sycl::handler &cgh) {
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+
+        hipblasSetStream(handle, hipStreamHandle);
+
+        // NOTE(review): unlike the cuBLAS call, this status is not checked.
+        hipblasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, num_outputs,
+                                 N, num_inputs, &alpha, weights_, num_inputs,
+                                 input_tensor, num_inputs, &beta, output_tensor,
+                                 num_outputs);
+
+
+      });
+  });
+  #else
+   // Default backend: oneMKL column-major GEMM submitted to sycl_queue.
+   oneapi::mkl::blas::column_major::gemm(sycl_queue, transpose_type_transpose,
+        transpose_type_notranspose, num_outputs, N, num_inputs, alpha,
+        weights_, num_inputs, input_tensor, num_inputs, beta, output_tensor,
+        num_outputs);
+
+    // The returned event is not waited on here.
+  #endif
+
+  // addVectors runs in place on output_tensor with biases_ and act_.
+  if (use_bias_ || (act_ != ACTIVATION_NONE)) {
+    addVectors(output_tensor, biases_, output_tensor, num_outputs * N,
+               num_outputs, num_outputs * N, act_, sycl_queue);
+  }
+}
+
+template <typename DataType>
+FCLayer<DataType>::~FCLayer() {
+  sycl::free(this->biases_, this->sycl_queue_);   // device bias buffer
+  sycl::free(this->weights_, this->sycl_queue_);  // device weight buffer
+}
+
+template <typename DataType>
+PolicyMapLayer<DataType>::PolicyMapLayer(BaseLayer<DataType>* ip, int C, int H,
+                                         int W, int usedSize, bool attention, sycl::queue& sycl_queue)
+    : BaseLayer<DataType>(C, H, W, ip, sycl_queue),
+      used_size_(usedSize),
+      attention_map_(attention) {
+  // The map holds one short per entry: usedSize entries for an attention
+  // policy map, otherwise 64 entries per channel of the input layer.
+  const size_t per_channel_bytes = sizeof(short) * this->input_->GetC() * 64;
+  const size_t map_bytes = attention ? sizeof(short) * usedSize : per_channel_bytes;
+
+  weights_ = static_cast<short*>(sycl::malloc_device(map_bytes, sycl_queue_));
+}
+
+template <typename DataType>
+void PolicyMapLayer<DataType>::LoadWeights(const short* cpuWeight,
+                                           void* /*scratch*/) {
+  // Attention maps and non-NHWC layers upload the host table unchanged.
+  if (!nhwc_ || attention_map_) {
+    sycl_queue_.memcpy(weights_, cpuWeight, sizeof(short) * used_size_).wait();
+    return;
+  }
+  // Otherwise convert the table from CHW to HWC.
+  const int used_channels = used_size_ / 64;
+  const int padded_channels = this->input_->GetC();
+  // used_channels is the no. of channels actually used (typically 73).
+  // padded_channels is the no. of channels in the previous layer (padded
+  // up to 80). The weights of this layer are a mapping that selects which
+  // output index of the policy vector (1858 elements) each element of the
+  // input tensor maps to (assuming NCHW layout). There are 73x64 valid
+  // inputs (80x64 counting padding) and only 1858 outputs, so the mapping
+  // is not one to one: only a few of the indices point to a valid index in
+  // the policy vector, and invalid entries are set to -1.
+
+  // In fp16 mode the tensor layout is NHWC, so the weights have to be
+  // rearranged for the mapping to work as intended.
+
+  // This is what the original weights look like (CHW layout):
+  /*
+                  HW (64)
+          ----|-------------|
+              |             |
+              |             |
+    used (73) |             |
+              |             |
+              |             |
+          ------------------|   padded (80)
+              |  padding    |
+              |-------------|
+  */
+  // The padding is not part of the weights provided (used_size_ is 73 x 64).
+  //
+  // The weights converted to HWC look like this:
+  /*
+               used (73)
+          |-------------|---|
+          |             | P |
+          |             | a |
+  HW (64) |             | d |
+          |             |   |
+          |             |   |
+          |-----------------|
+                 padded (80)
+  */
+  // In HWC the padding becomes part of every row, so the table has to
+  // grow to make room for it.
+  // The pad elements hold -1 (invalid output index), and the same kernel
+  // works for both HWC and CHW layouts once used_size_ has been updated
+  // to include the padding (80x64).
+  //
+
+  used_size_ = padded_channels * 64;
+  std::vector<short> hwc_table(used_size_);
+
+  for (int pos = 0; pos < 64; ++pos) {
+    short* const row = hwc_table.data() + pos * padded_channels;
+    for (int ch = 0; ch < padded_channels; ++ch) {
+      // Channels beyond the used ones are padding and get -1.
+      row[ch] = (ch < used_channels) ? cpuWeight[ch * 64 + pos] : short(-1);
+    }
+  }
+
+  // Copy the converted table to the device and wait, since hwc_table is
+  // destroyed when this function returns.
+  sycl_queue_.memcpy(weights_, hwc_table.data(),
+                     used_size_ * sizeof(short)).wait();
+}
+
+template <typename DataType>
+void PolicyMapLayer<DataType>::Eval(
+    int N, DataType* output_tensor, const DataType* input_tensor,
+    const DataType* /*input2*/, void* /*scratch*/, size_t /*scratch_size*/, sycl::queue &sycl_queue, DataType***) {
+  // Per-sample element counts handed to PolicyMap.
+  const int outputSize = this->C * this->H * this->W;
+  const int full_input =
+      this->input_->GetC() * this->input_->GetH() * this->input_->GetW();
+  // An attention policy map uses used_size_ as its input size instead of the
+  // size of the whole input tensor.
+  const int inputSize = attention_map_ ? used_size_ : full_input;
+
+  PolicyMap(N, output_tensor, input_tensor, weights_, inputSize, used_size_, outputSize, sycl_queue);
+}
+
+template <typename DataType> PolicyMapLayer<DataType>::~PolicyMapLayer() {
+  sycl::free(weights_, sycl_queue_);  // Frees the device policy map; qualified so it does not rely on ADL.
+}
+
+template <typename DataType>
+FusedWinogradConvSELayer<DataType>::FusedWinogradConvSELayer(
+    BaseLayer<DataType>* ip, int C, int H, int W, int Cin,
+    ActivationFunction activation, bool bias, bool skip_add, bool se, int se_k,
+    sycl::queue &sycl_queue, bool op_nhcw)
+    : BaseLayer<DataType>(C, H, W, ip, false, sycl_queue),
+      c_input_(Cin),
+      act_(activation),
+      use_bias_(bias),
+      skip_add_(skip_add),
+      has_se_(se),
+      se_k_(se_k),
+      op_nhcw_(op_nhcw){
+  // Reject any activation other than RELU, MISH or NONE.
+  const bool activation_supported = act_ == ACTIVATION_RELU ||
+                                    act_ == ACTIVATION_MISH ||
+                                    act_ == ACTIVATION_NONE;
+  if (!activation_supported) {
+    throw Exception("Unsupported activation for fused winograd conv SE layer.");
+  }
+
+  // Allocates device memory for `count` elements of DataType times `factor`.
+  auto device_alloc = [this](size_t count, size_t factor) {
+    return static_cast<DataType*>(
+        sycl::malloc_device(sizeof(DataType) * count * factor, sycl_queue_));
+  };
+
+  if (use_bias_) biases_ = device_alloc(C, 1);  // one bias per output channel
+
+  // The 3x3 filters are stored in their 6x6 transformed form, which is four
+  // times as large.
+  const size_t filter_elements = static_cast<size_t>(c_input_) * C * 3 * 3;
+  transformed_weights_ = device_alloc(filter_elements, 4);
+
+  if (has_se_) {
+    // SE weights and biases; each buffer is allocated at four times the
+    // size of its element count.
+    const size_t fc1_weights = C * se_k_;
+    const size_t fc1_biases = se_k_;
+    const size_t fc2_biases = 2 * C;
+    w1_ = device_alloc(fc1_weights, 4);
+    w2_ = device_alloc(fc1_weights * 2, 4);
+    b1_ = device_alloc(fc1_biases, 4);
+    b2_ = device_alloc(fc2_biases, 4);
+  }
+}
+
+template <typename DataType> void FusedWinogradConvSELayer<DataType>::LoadWeights(float* pfilter,
+                                                     float* pBias,
+                                                     void* scratch) {
+  // Uploads the fp32 filter (and optional bias), converts them to DataType on
+  // the device and stores the transformed filter in transformed_weights_.
+  // pfilter: host fp32 filter, c_input_ * C * 3 * 3 values.
+  // pBias:   host fp32 bias, C values; may be null.
+  // scratch: device staging buffer; must not be null.
+  const size_t weight_size = sizeof(float) * c_input_ * C * 3 * 3;
+  const size_t bias_size = sizeof(float) * C;
+
+  // The start of scratch stages the raw fp32 host data; the converted,
+  // untransformed weights live further into scratch, past that staging area.
+  // Note that the byte counts are used as an element offset here.
+  assert(scratch);
+  const DataType* weights = (DataType*)scratch + weight_size + bias_size;
+
+  // First copy from CPU memory to scratch space in GPU memory, then do the
+  // type conversion using a kernel.
+  sycl_queue_.memcpy(scratch, pfilter, weight_size).wait();
+  copyTypeConverted((DataType*)weights, (float*)scratch, C * c_input_ * 3 * 3, sycl_queue_);
+
+  if (pBias) {
+    // Wait for the copy, like the filter upload above, so pBias is not read
+    // after this function returns.
+    sycl_queue_.memcpy(scratch, pBias, bias_size).wait();
+    copyTypeConverted((DataType*)biases_, (float*)scratch, C, sycl_queue_);
+  }
+
+  // Run the Winograd transform kernel for the filter.
+  FilterTransform(C, c_input_, transformed_weights_, weights, sycl_queue_);
+}
+
+// TODO: Do this on the GPU to improve network load time!
+// Stores the transpose of the row-major rows x cols matrix `ip` in `op`.
+static inline void CpuTranspose(float* op, float* ip, size_t rows, size_t cols) {
+  for (size_t idx = 0; idx < rows * cols; ++idx)
+    op[(idx % cols) * rows + idx / cols] = ip[idx];
+}
+
+template <typename DataType>
+void FusedWinogradConvSELayer<DataType>::LoadSEWeights(float* w1, float* b1,
+                                                       float* w2, float* b2,
+                                                       void* scratch) {
+  // Loads the SE weights (w1, w2) and biases (b1, b2) from host fp32 arrays
+  // into the device buffers w1_, w2_, b1_ and b2_.
+  const size_t fc1_weight_count = C * se_k_;
+  const size_t fc2_weight_count = fc1_weight_count * 2;
+  const size_t fc1_bias_count = se_k_;
+  const size_t fc2_bias_count = 2 * C;
+
+  // Stages `count` host floats in scratch, waits for the copy, then launches
+  // the conversion of the staged floats into the device buffer `dst`.
+  auto upload = [&](DataType* dst, const float* host_src, size_t count) {
+    sycl_queue_.memcpy(scratch, host_src, count * sizeof(float)).wait();
+    copyTypeConverted(dst, (float*)scratch, (int)count, sycl_queue_);
+  };
+
+  // The shader uses transposed weight matrices, so both weight matrices are
+  // transposed on the host first. The larger one sizes the shared buffer.
+  std::vector<float> transposed(fc2_weight_count);
+
+  CpuTranspose(transposed.data(), w1, se_k_, C);
+  upload((DataType*)w1_, transposed.data(), fc1_weight_count);
+
+  CpuTranspose(transposed.data(), w2, 2 * C, se_k_);
+  upload((DataType*)w2_, transposed.data(), fc2_weight_count);
+
+  // The biases are uploaded as they are, without a transpose.
+  upload((DataType*)b1_, b1, fc1_bias_count);
+  upload((DataType*)b2_, b2, fc2_bias_count);
+
+}
+
+template <>
+ void BaseLayer<sycl::half>::cublasRowMajorMatrixMul(const sycl::half* A, const sycl::half* B,
+                                               sycl::half* Out, int M, int N, int K,
+                                               int batchSize, sycl::queue &sycl_queue) {
+   // Batched fp16 product Out = A * B. The cuBLAS path spells 1.0 and 0.0 as raw
+   // fp16 bit patterns (original note: typecasting float to half doesn't work before CUDA 10.0).
+   #ifdef USE_CUBLAS
+    __half_raw one_h{0x3C00};
+    __half_raw zero_h{0};
+    half alpha = one_h;
+    half beta = zero_h;
+
+    #else
+    sycl::half alpha = 1;
+    sycl::half beta = 0;
+    #endif
+   // Per batch entry (batchSize entries, stored back to back):
+   // dimensions of matrix A = M x K
+   // dimensions of matrix B = K x N
+   // dimensions of output   = M x N
+
+   // The BLAS calls below are column-major. To multiply row-major matrices the
+   // operands are swapped: they are called with (B, A) and sizes (N, M, K).
+
+  #ifdef USE_CUBLAS
+   cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+
+   sycl_queue.submit([&](sycl::handler &cgh) {
+        // The native command issues the cuBLAS call on the queue's CUDA stream.
+         cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+
+          auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+          cublasSetStream(handle, cudaStreamHandle);
+
+          ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+             handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &one_h, B, CUDA_R_16F, N,
+             N * K, A, CUDA_R_16F, K, K * M, &zero_h, Out, CUDA_R_16F, N, N * M,
+             batchSize, CUDA_R_16F, CUBLAS_GEMM_DEFAULT));
+
+
+         });
+   });
+  // hipBLAS path: alpha and beta are the sycl::half values from the #else branch above.
+#elif defined(USE_HIPBLAS)
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+
+  sycl_queue.submit([&](sycl::handler &cgh) {
+    cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+      auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+      hipblasSetStream(handle, hipStreamHandle);
+
+      hipblasGemmStridedBatchedEx(
+          handle, transpose_type_notranspose, transpose_type_notranspose, N, M,
+          K, &alpha, B, HIPBLAS_R_16F, N, N * K, A, HIPBLAS_R_16F, K, K * M,
+          &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_COMPUTE_16F,
+          HIPBLAS_GEMM_DEFAULT);
+
+    });
+  });
+#else
+  int64_t M_ = M;
+  int64_t N_ = N;
+  int64_t K_ = K;
+  oneapi::mkl::blas::column_major::gemm_batch(
+      sycl_queue, transpose_type_notranspose, transpose_type_notranspose, N_,
+      M_, K_, alpha, B, N_, N_ * K_, A, K_, K_ * M_, beta, Out, N_, N_ * M_,
+      batchSize);
+#endif
+ }
+
+template <> void BaseLayer<float>::cublasRowMajorMatrixMul(const float* A, const float* B,
+                                               float* Out, int M, int N, int K,
+                                               int batchSize, sycl::queue &sycl_queue) {
+  float floatOne = 1.0f;
+  float floatZero = 0.0f;
+  // fp32 batched product Out = A * B: A is M x K, B is K x N, Out is M x N, with batchSize entries stored back to back.
+  int64_t M_ = M;
+  int64_t N_ = N;
+  int64_t K_ = K;
+  // The 64-bit copies above are what the oneMKL branch passes to gemm_batch.
+  #ifdef USE_CUBLAS
+  // The handle is obtained from cuBlasContextManager rather than created
+  // here.
+  cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+  #endif
+
+  #ifdef USE_HIPBLAS
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  #endif
+  // The BLAS calls are column-major, so the operands are swapped: (B, A) with sizes (N, M, K).
+  {
+    #ifdef USE_CUBLAS
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        // The native command issues the cuBLAS call on the queue's CUDA stream.
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+            auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+            cublasSetStream(handle, cudaStreamHandle);
+
+          ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+            handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, CUDA_R_32F, N,
+            N * K, A, CUDA_R_32F, K, K * M, &floatZero, Out, CUDA_R_32F, N, N * M,
+          batchSize, CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
+
+
+        });
+    });
+    #elif defined(USE_HIPBLAS)
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        // The native command issues the hipBLAS call on the queue's HIP stream.
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+            auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+            hipblasSetStream(handle, hipStreamHandle);
+
+          hipblasGemmStridedBatchedEx(
+            handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, HIPBLAS_R_32F, N,
+            N * K, A, HIPBLAS_R_32F, K, K * M, &floatZero, Out, HIPBLAS_R_32F, N, N * M,
+          batchSize, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT);
+
+
+        });
+    });
+    #else
+      oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transpose_type_notranspose,
+            transpose_type_notranspose, N_, M_, K_, floatOne, B, N_, N_ * K_, A, K_, K_ * M_, floatZero, Out, N_, N_ * M_, batchSize);
+    #endif
+  }
+}
+
+template <typename DataType>
+void FusedWinogradConvSELayer<DataType>::Eval(
+    int N, DataType* output, const DataType* input, const DataType* input2,
+    void* scratch, size_t scratch_size, sycl::queue &sycl_queue, DataType***) {
+  // Evaluates the layer in three stages: InputTransform, a batched matrix
+  // multiply with transformed_weights_ (36 batch entries), and the
+  // OutputTransform variant matching this layer's configuration.
+  // Split the scratch space into two parts - use first part for holding
+  // transformed input and second part for transformed output.
+  DataType* transformed_input = (DataType*)scratch;
+  DataType* transformed_output =
+      transformed_input + scratch_size / (2 * sizeof(DataType));
+
+  InputTransform<DataType, false>(N, c_input_, transformed_input, input, sycl_queue);
+  BaseLayer<DataType>::cublasRowMajorMatrixMul(
+      transformed_input, transformed_weights_, transformed_output, N * 4, C, c_input_, 36, sycl_queue);
+  // Select the OutputTransform instantiation for (act_, has_se_, use_bias_, skip_add_, op_nhcw_); unsupported combinations throw.
+  if (act_ == ACTIVATION_NONE) {
+    if (!has_se_ && use_bias_ && !skip_add_)
+      OutputTransform<DataType, false, ACTIVATION_NONE, true, false, false, false>(
+          N, C, 0, output, transformed_output, nullptr, biases_, nullptr, nullptr, nullptr, nullptr, sycl_queue);
+    else
+      throw Exception("unsupported network type!");
+  } else if (act_ == ACTIVATION_RELU) {
+    if (has_se_ && use_bias_ && skip_add_)
+      OutputTransform<DataType, true, ACTIVATION_RELU, true, true, false, false>(
+          N, C, se_k_, output, transformed_output, input2, biases_, w1_, b1_,
+          w2_, b2_, sycl_queue);
+    else if (!has_se_ && use_bias_ && !skip_add_) {
+      if (op_nhcw_)
+        OutputTransform<DataType, false, ACTIVATION_RELU, true, false, false, true>(
+            N, C, 0, output, transformed_output, nullptr, biases_, nullptr,
+            nullptr, nullptr, nullptr, sycl_queue);
+      else
+        OutputTransform<DataType, false, ACTIVATION_RELU, true, false, false, false>(
+            N, C, 0, output, transformed_output, nullptr, biases_, nullptr,
+            nullptr, nullptr, nullptr, sycl_queue);
+    } else if (!has_se_ && use_bias_ && skip_add_)
+      OutputTransform<DataType, false, ACTIVATION_RELU, true, true, false, false>(
+          N, C, 0, output, transformed_output, input2, biases_, nullptr,
+          nullptr, nullptr, nullptr, sycl_queue);
+    else
+      throw Exception("unsupported network type!");
+  } else if (act_ == ACTIVATION_MISH) {
+    if (has_se_ && use_bias_ && skip_add_)
+      OutputTransform<DataType, true, ACTIVATION_MISH, true, true, false, false>(
+          N, C, se_k_, output, transformed_output, input2, biases_, w1_, b1_,
+          w2_, b2_, sycl_queue);
+    else if (!has_se_ && use_bias_ && !skip_add_) {
+      if (op_nhcw_)
+        OutputTransform<DataType, false, ACTIVATION_MISH, true, false, false, true>(
+            N, C, 0, output, transformed_output, nullptr, biases_, nullptr,
+            nullptr, nullptr, nullptr, sycl_queue);
+      else
+        OutputTransform<DataType, false, ACTIVATION_MISH, true, false, false, false>(
+            N, C, 0, output, transformed_output, nullptr, biases_, nullptr,
+            nullptr, nullptr, nullptr, sycl_queue);
+    } else if (!has_se_ && use_bias_ && skip_add_)
+      OutputTransform<DataType, false, ACTIVATION_MISH, true, true, false, false>(
+          N, C, 0, output, transformed_output, input2, biases_, nullptr,
+          nullptr, nullptr, nullptr, sycl_queue);
+    else
+      throw Exception("unsupported network type!");
+  } else
+    throw Exception("unsupported network type!");
+}
+
+template <typename DataType>
+FusedWinogradConvSELayer<DataType>::~FusedWinogradConvSELayer() {
+  // Frees the transformed filter always, the bias only when use_bias_ is set
+  // and the four SE buffers only when has_se_ is set.
+  sycl::free(transformed_weights_, sycl_queue_);
+  if (use_bias_) sycl::free(biases_, sycl_queue_);
+  if (!has_se_) return;
+  for (DataType* se_buffer : {w1_, w2_, b1_, b2_}) {
+    sycl::free(se_buffer, sycl_queue_);
+  }
+}
+
+template <typename DataType>
+Conv1Layer<DataType>::Conv1Layer(BaseLayer<DataType>* ip, int C, int H, int W,
+                                 int Cin, ActivationFunction activation,
+                                 bool bias, sycl::queue& sycl_queue)
+    : BaseLayer<DataType>(C, H, W, ip, false, sycl_queue),
+      c_input_(Cin),
+      act_(activation),
+      use_bias_(bias) {
+  // A 1x1 filter has one weight per (input channel, output channel) pair;
+  // the optional bias has one value per output channel.
+  const size_t filter_bytes = sizeof(DataType) * c_input_ * C * 1 * 1;
+  weights_ = static_cast<DataType*>(sycl::malloc_device(filter_bytes, sycl_queue_));
+
+  // biases_ is only allocated when the layer was created with a bias.
+  if (!use_bias_) return;
+
+  const size_t bias_bytes = sizeof(DataType) * C;
+  biases_ = static_cast<DataType*>(sycl::malloc_device(bias_bytes, sycl_queue_));
+}
+
+template <typename DataType> void Conv1Layer<DataType>::LoadWeights(float* pfilter, float* pBias, void* scratch) {
+  // Host fp32 data is first copied into `scratch` and then converted into the
+  // layer's own buffers by copyTypeConverted, so scratch must be provided.
+  assert(scratch);
+  float* const staging = (float*)scratch;
+
+  const auto filter_count = C * c_input_ * 1 * 1;
+  sycl_queue_.memcpy(scratch, pfilter, sizeof(float) * c_input_ * C * 1 * 1).wait();
+  copyTypeConverted((DataType*)weights_, staging, filter_count, sycl_queue_);
+
+  if (!pBias) return;  // No bias supplied: nothing more to upload.
+  sycl_queue_.memcpy(scratch, pBias, sizeof(float) * C).wait();
+  copyTypeConverted((DataType*)biases_, staging, C, sycl_queue_);
+}
+
+
+// fp16 specialization of the batched matrix multiply used by Conv1Layer.
+//
+// Issues one strided-batched GEMM through cuBLAS (USE_CUBLAS), hipBLAS
+// (USE_HIPBLAS) or oneMKL (neither macro defined). The BLAS call is
+// column-major with dimensions (N, M, K); B is the first operand (leading
+// dimension N, batch stride N * K) and A is the second operand (leading
+// dimension K, batch stride 0), so the same A is reused for every one of the
+// batchSize entries. Out has leading dimension N and batch stride N * M.
+//
+// NOTE(review): the hipBLAS return status is not checked, whereas the cuBLAS
+// call is wrapped in ReportCUBLASErrors.
+template <>
+  void Conv1Layer<sycl::half>::cublasSpecialMatrixMul(const sycl::half* A, const sycl::half* B,
+                                               sycl::half* Out, int M, int N, int K,
+                                               int batchSize, sycl::queue &sycl_queue) {
+
+  // The cuBLAS path builds fp16 1.0 (0x3C00) and 0.0 from raw bit patterns
+  // because typecasting float to half doesn't work before CUDA 10.0; the
+  // other paths initialize sycl::half values directly.
+
+  #ifdef USE_CUBLAS
+   cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+  #endif
+
+  #ifdef USE_HIPBLAS
+   hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  #endif
+
+
+   #ifdef USE_CUBLAS
+    __half_raw one_h{0x3C00};
+    __half_raw zero_h{0};
+    // NOTE: the cuBLAS call below passes &one_h / &zero_h; alpha and beta are
+    // only read by the hipBLAS and oneMKL paths.
+    half alpha = one_h;
+    half beta = zero_h;
+
+    #else
+    sycl::half alpha = 1;
+    sycl::half beta = 0;
+    #endif
+
+#ifdef USE_CUBLAS
+    sycl_queue.submit([&](sycl::handler &cgh) {
+         
+         // The inner lambda captures by value, so the scalar constants and
+         // the handle are copied into the native command.
+         cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+  
+          // Bind the BLAS handle to the native stream behind this SYCL queue.
+          auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+          cublasSetStream(handle, cudaStreamHandle);
+
+
+         ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+         handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &one_h, B, CUDA_R_16F, N,
+         N * K, A, CUDA_R_16F, K, 0, &zero_h, Out, CUDA_R_16F, N, N * M,
+         batchSize, CUDA_R_16F, CUBLAS_GEMM_DEFAULT));
+
+         });   
+   });
+#elif defined(USE_HIPBLAS)
+    sycl_queue.submit([&](sycl::handler &cgh) {
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+         // Bind the BLAS handle to the native stream behind this SYCL queue.
+         auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+         hipblasSetStream(handle, hipStreamHandle);
+         hipblasGemmStridedBatchedEx(
+              handle, transpose_type_notranspose, transpose_type_notranspose,
+              N, M, K, &alpha, B, HIPBLAS_R_16F, N, N * K, A, HIPBLAS_R_16F, K,
+              0, &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_COMPUTE_16F,
+              HIPBLAS_GEMM_DEFAULT);
+      });
+    });
+#else
+    // oneMKL path: dimensions are widened to int64_t before the call.
+    int64_t M_ = M;
+    int64_t N_ = N;
+    int64_t K_ = K;
+    oneapi::mkl::blas::column_major::gemm_batch(
+        sycl_queue, transpose_type_notranspose, transpose_type_notranspose, N_,
+        M_, K_, alpha, B, N_, N_ * K_, A, K_, 0, beta, Out, N_, N_ * M_,
+        batchSize);
+#endif
+}
+
+// fp32 specialization of the batched matrix multiply used by Conv1Layer.
+//
+// Issues one strided-batched GEMM through cuBLAS (USE_CUBLAS), hipBLAS
+// (USE_HIPBLAS) or oneMKL (neither macro defined), with alpha = 1 and
+// beta = 0. The BLAS call is column-major with dimensions (N, M, K); B is the
+// first operand (leading dimension N, batch stride N * K) and A is the second
+// operand (leading dimension K, batch stride 0), so the same A is reused for
+// every one of the batchSize entries. Out has leading dimension N and batch
+// stride N * M.
+//
+// NOTE(review): the hipBLAS return status is not checked, whereas the cuBLAS
+// call is wrapped in ReportCUBLASErrors.
+template <>
+void Conv1Layer<float>::cublasSpecialMatrixMul(const float* A, const float* B,
+                                               float* Out, int M, int N, int K,
+                                               int batchSize, sycl::queue &sycl_queue) {
+  float floatOne = 1.0f;
+  float floatZero = 0.0f;
+
+
+  // 64-bit copies of the dimensions; only the oneMKL path below reads them.
+  int64_t M_ = M;
+  int64_t N_ = N;
+  int64_t K_ = K;
+
+  #ifdef USE_CUBLAS
+   cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+  #endif
+
+  #ifdef USE_HIPBLAS
+   hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  #endif
+
+  // NOTE strideB set to 0 below!
+  {
+    #ifdef USE_CUBLAS
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        
+        // The inner lambda captures by value, so floatOne / floatZero and the
+        // handle are copied into the native command.
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+  
+         // Bind the BLAS handle to the native stream behind this SYCL queue.
+         auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+         cublasSetStream(handle, cudaStreamHandle);
+
+
+        ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+          handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, CUDA_R_32F, N,
+          N * K, A, CUDA_R_32F, K, 0, &floatZero, Out, CUDA_R_32F, N, N * M,
+          batchSize, CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
+
+        });   
+    });
+    #elif defined(USE_HIPBLAS)
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+         // Bind the BLAS handle to the native stream behind this SYCL queue.
+         auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+  
+         hipblasSetStream(handle, hipStreamHandle);
+
+
+        hipblasGemmStridedBatchedEx(
+          handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, HIPBLAS_R_32F, N,
+          N * K, A, HIPBLAS_R_32F, K, 0, &floatZero, Out, HIPBLAS_R_32F, N, N * M,
+          batchSize, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT);
+
+        });   
+    });
+    #else
+      oneapi::mkl::blas::column_major::gemm_batch(
+        sycl_queue, transpose_type_notranspose,
+        transpose_type_notranspose, N_, M_, K_, floatOne, B, N_, N_ * K_, A, K_,
+        0, floatZero, Out, N_, N_ * M_, batchSize); 
+    #endif
+  }
+}
+
+// Evaluates the layer for a batch of N inputs.
+//
+// The batched matrix multiply writes weights_ x input into `output`. Then
+// either the bias is added (together with act_) by addBias_NCHW, or, for a
+// bias-free layer with an activation other than ACTIVATION_NONE, addVectors is
+// called in place on `output` with a null second operand. The input2, scratch
+// and scratch_size arguments and the trailing DataType*** are unused.
+template <typename DataType>
+void Conv1Layer<DataType>::Eval(int N, DataType* output, const DataType* input,
+                                const DataType* /*input2*/, void* /*scratch*/,
+                                size_t /*scratch_size*/,
+                                sycl::queue& sycl_queue, DataType***) {
+  const auto spatial_size = H * W;
+  cublasSpecialMatrixMul(weights_, input, output, C, spatial_size, c_input_, N,
+                         sycl_queue);
+
+  if (use_bias_) {
+    addBias_NCHW(output, output, biases_, N, C, H, W, act_, sycl_queue);
+    return;
+  }
+
+  if (act_ == ACTIVATION_NONE) {
+    return;
+  }
+
+  const auto element_count = N * C * H * W;
+  addVectors(output, output, (DataType*)nullptr, element_count, element_count,
+             0, act_, sycl_queue);
+}
+
+// Releases the filter buffer and, when the layer was built with a bias, the
+// bias buffer.
+template <typename DataType>
+Conv1Layer<DataType>::~Conv1Layer() {
+  sycl::free(weights_, sycl_queue_);
+
+  if (use_bias_) {
+    sycl::free(biases_, sycl_queue_);
+  }
+}
+
+// Constructs a residual block with C channels on an 8x8 plane and allocates
+// the device buffers it needs: two bias vectors, two transformed filter
+// tensors and, when `se` is true, the w1_/w2_/b1_/b2_ buffers.
+//
+// Throws Exception when `activation` is neither ACTIVATION_RELU nor
+// ACTIVATION_MISH. The buffers are only allocated here; the LoadWeights0 /
+// LoadWeights1 / LoadSEWeights methods fill them.
+template <typename DataType>
+ResidualBlock<DataType>::ResidualBlock(BaseLayer<DataType>* ip, int C, bool se,
+                                       int se_k, bool first, bool last,
+                                       ActivationFunction activation,
+                                       int shared_mem_size,
+                                       sycl::queue& sycl_queue)
+    : BaseLayer<DataType>(C, 8, 8, ip, ip->isNHWC(), sycl_queue),
+      has_se_(se),
+      se_k_(se_k),
+      c_input_(C),
+      first_block_(first),
+      last_block_(last),
+      shared_mem_size_(shared_mem_size),
+      act_(activation) {
+  const bool supported_activation =
+      act_ == ACTIVATION_RELU || act_ == ACTIVATION_MISH;
+  if (!supported_activation) {
+    throw Exception("Unsupported activation for residual block.");
+  }
+
+  // One bias value per channel for each of the two convolutions.
+  const size_t bias_bytes = sizeof(DataType) * C;
+  biases0_ =
+      static_cast<DataType*>(sycl::malloc_device(bias_bytes, sycl_queue_));
+  biases1_ =
+      static_cast<DataType*>(sycl::malloc_device(bias_bytes, sycl_queue_));
+
+  // A 3x3 filter occupies 6x6 values once transformed, i.e. four times the
+  // size of the untransformed C * C * 3 * 3 tensor.
+  const size_t filter_bytes = sizeof(DataType) * C * C * 3 * 3;
+  const size_t transformed_bytes = filter_bytes * 4;
+  transformed_weights0_ = static_cast<DataType*>(
+      sycl::malloc_device(transformed_bytes, sycl_queue_));
+  transformed_weights1_ = static_cast<DataType*>(
+      sycl::malloc_device(transformed_bytes, sycl_queue_));
+
+  if (!has_se_) {
+    return;
+  }
+
+  // Element counts of the w1_/w2_/b1_/b2_ buffers.
+  const size_t w1_count = C * se_k_;
+  const size_t w2_count = w1_count * 2;
+  const size_t b1_count = se_k_;
+  const size_t b2_count = 2 * C;
+
+  w1_ = static_cast<DataType*>(
+      sycl::malloc_device(sizeof(DataType) * w1_count, sycl_queue_));
+  w2_ = static_cast<DataType*>(
+      sycl::malloc_device(sizeof(DataType) * w2_count, sycl_queue_));
+  b1_ = static_cast<DataType*>(
+      sycl::malloc_device(sizeof(DataType) * b1_count, sycl_queue_));
+  b2_ = static_cast<DataType*>(
+      sycl::malloc_device(sizeof(DataType) * b2_count, sycl_queue_));
+}
+
+// Uploads the filter, and the bias when pBias is non-null, of the block's
+// first convolution, then runs the Winograd filter transform into
+// transformed_weights0_.
+//
+// The float data is copied from CPU memory into `scratch` (which must be
+// non-null) and converted to DataType by a kernel; the converted,
+// untransformed filter also lives in `scratch`, further along.
+template <typename DataType>
+void ResidualBlock<DataType>::LoadWeights0(float* pfilter, float* pBias,
+                                           void* scratch) {
+  assert(scratch);
+
+  const size_t filter_bytes = sizeof(float) * c_input_ * C * 3 * 3;
+  float* const staging = static_cast<float*>(scratch);
+
+  // Destination of the converted (still untransformed) filter inside scratch.
+  // NOTE(review): the offset is a byte count applied to a DataType pointer, so
+  // it skips filter_bytes * sizeof(DataType) bytes - more than the staged
+  // floats need. Presumably scratch is large enough; confirm with the caller.
+  const DataType* converted = (DataType*)scratch + filter_bytes;
+
+  sycl_queue_.memcpy(scratch, pfilter, filter_bytes).wait();
+  copyTypeConverted((DataType*)converted, staging, C * c_input_ * 3 * 3,
+                    sycl_queue_);
+
+  if (pBias) {
+    const size_t bias_bytes = sizeof(float) * C;
+    sycl_queue_.memcpy(scratch, pBias, bias_bytes).wait();
+    copyTypeConverted((DataType*)biases0_, staging, C, sycl_queue_);
+  }
+
+  // Winograd-transform the converted filter into its final buffer.
+  FilterTransform(C, c_input_, transformed_weights0_, converted, sycl_queue_);
+}
+
+// Uploads the filter, and the bias when pBias is non-null, of the block's
+// second convolution, then runs the Winograd filter transform into
+// transformed_weights1_.
+//
+// pfilter: CPU pointer to C * C * 3 * 3 float filter values.
+// pBias:   CPU pointer to C float bias values, or null to skip the bias.
+// scratch: scratch buffer used for staging; must be non-null. It also holds
+//          the converted, untransformed filter, further along.
+template <typename DataType>
+void ResidualBlock<DataType>::LoadWeights1(float* pfilter, float* pBias,
+                                           void* scratch) {
+  const size_t weight_size = sizeof(float) * C * C * 3 * 3;
+  const size_t bias_size = sizeof(float) * C;
+
+  // Store untransformed weights in scratch.
+  // NOTE(review): the offset is a byte count applied to a DataType pointer, so
+  // it skips weight_size * sizeof(DataType) bytes - more than the staged
+  // floats need. Presumably scratch is large enough; confirm with the caller.
+  const DataType* weights = (DataType*)scratch + weight_size;
+
+  // first copy from CPU memory to scratch space in GPU memory
+  // and then do the type conversion using a kernel
+  assert(scratch);
+  sycl_queue_.memcpy(scratch, pfilter, weight_size).wait();
+
+  copyTypeConverted((DataType*)weights, (float*)scratch, C * C * 3 * 3,
+                    sycl_queue_);
+
+  if (pBias) {
+    // Wait for the copy to finish before launching the conversion kernel that
+    // reads scratch, exactly as LoadWeights0 and the filter upload above do.
+    sycl_queue_.memcpy(scratch, pBias, bias_size).wait();
+    copyTypeConverted((DataType*)biases1_, (float*)scratch, C, sycl_queue_);
+  }
+
+  // run winograd transform kernel for the filter
+  FilterTransform(C, C, transformed_weights1_, weights, sycl_queue_);
+}
+
+// Uploads the w1/w2 weights and b1/b2 biases into w1_, w2_, b1_ and b2_.
+//
+// w1 and w2 are transposed on the CPU first (the shader uses transposed weight
+// matrices). Every tensor is staged as float in `scratch` and then converted
+// to DataType by copyTypeConverted().
+template <typename DataType>
+void ResidualBlock<DataType>::LoadSEWeights(float* w1, float* b1, float* w2,
+                                            float* b2, void* scratch) {
+  const size_t w1_count = C * se_k_;
+  const size_t w2_count = w1_count * 2;
+  const size_t b1_count = se_k_;
+  const size_t b2_count = 2 * C;
+
+  // Stages `count` floats in scratch, waits for the copy, then converts them
+  // into `dst`.
+  const auto upload = [&](DataType* dst, const float* src, size_t count) {
+    sycl_queue_.memcpy(scratch, src, count * sizeof(float)).wait();
+    copyTypeConverted(dst, (float*)scratch, (int)count, sycl_queue_);
+  };
+
+  // Sized for the larger of the two weight matrices so it can be reused.
+  std::vector<float> transposed(w2_count);
+
+  CpuTranspose(transposed.data(), w1, se_k_, C);
+  upload((DataType*)w1_, transposed.data(), w1_count);
+
+  CpuTranspose(transposed.data(), w2, 2 * C, se_k_);
+  upload((DataType*)w2_, transposed.data(), w2_count);
+
+  upload((DataType*)b1_, b1, b1_count);
+  upload((DataType*)b2_, b2, b2_count);
+}
+
+// Evaluates the residual block for a batch of N inputs.
+//
+// Steps, in order:
+//   1. Pick the transformed_input / transformed_output work areas, either from
+//      `scratch` (split in half) or, when scratch is null, from `output`.
+//   2. First convolution: a batched matrix multiply with transformed_weights0_
+//      (preceded by InputTransform of `input` when first_block_ is set).
+//   3. OutputInputTransform with biases0_ and the block's activation, which
+//      leaves the transformed input of the second convolution in
+//      transformed_input.
+//   4. Second convolution: a batched matrix multiply with
+//      transformed_weights1_.
+//   5. A final transform chosen by act_, last_block_, has_se_ and allowFusing,
+//      which receives `input`, biases1_ and the w1_/b1_/w2_/b2_ buffers.
+//
+// Only ACTIVATION_RELU and ACTIVATION_MISH are dispatched; any other act_
+// value skips steps 3 and 5 (the constructor rejects other values).
+template <typename DataType>
+void ResidualBlock<DataType>::Eval(int N, DataType* output,
+                                   const DataType* input,
+                                   const DataType* /*input2*/, void* scratch,
+                                   size_t scratch_size, sycl::queue &sycl_queue, DataType***) {
+
+  //CERR << "ResidualBlock<DataType>::Eval. ";
+  // normally:
+  // - "output" initially contains the transformed input,
+  //    and after this layer, it contains the transformed input for next layer
+  // - "input" contains the original/untransformed input
+  // special cases:
+  //   - for first_block_, input is real input (untransformed)
+  //   - for last_block_, output is the final output of this block
+  //   (untransformed)
+
+  // Split the scratch space into two parts - use first part for holding
+  // transformed input and second part for transformed output.
+  DataType* transformed_input;
+  DataType* transformed_output;
+  if (!scratch) {
+    // Caller wants us to sub-allocate all memory we need from "output" tensor.
+    transformed_input = output;  // This is true in normal cases too!
+    transformed_output = transformed_input + (N * C * 8 * 8 * 36 / 16);
+  } else {
+    transformed_input = (DataType*)scratch;
+    transformed_output =
+        transformed_input + scratch_size / (2 * sizeof(DataType));
+  }
+
+  if (first_block_) {
+    // NOTE(review): this call uses the member sycl_queue_ while every other
+    // call in this function uses the sycl_queue parameter - confirm both
+    // always refer to the same queue.
+    InputTransform<DataType, true>(N, c_input_, transformed_input, input, sycl_queue_);
+    BaseLayer<DataType>::cublasRowMajorMatrixMul(
+        transformed_input, transformed_weights0_, transformed_output, N * 4, C,
+        c_input_, 36, sycl_queue);
+  } else {
+    BaseLayer<DataType>::cublasRowMajorMatrixMul(output, transformed_weights0_,
+                                                 transformed_output, N * 4, C,
+                                                 c_input_, 36, sycl_queue);
+  }
+
+  // The activation is a template argument, so each supported value needs its
+  // own instantiation.
+  if (act_ == ACTIVATION_RELU) {
+    OutputInputTransform<DataType, false, ACTIVATION_RELU, true, false>(
+        N, C, 0, transformed_input, transformed_output, nullptr, biases0_,
+        nullptr, nullptr, nullptr, nullptr, sycl_queue);
+  } else if (act_ == ACTIVATION_MISH) {
+    OutputInputTransform<DataType, false, ACTIVATION_MISH, true, false>(
+        N, C, 0, transformed_input, transformed_output, nullptr, biases0_,
+        nullptr, nullptr, nullptr, nullptr, sycl_queue);
+  }
+  // "transformed_input" tensor now contains transformed input for the next
+  // convolution
+
+  BaseLayer<DataType>::cublasRowMajorMatrixMul(
+      transformed_input, transformed_weights1_, transformed_output, N * 4, C, C,
+      36, sycl_queue);
+
+  // The fused OutputInputTransform is used when the channel count is small
+  // enough, or for fp16 when shared_mem_size_ reaches the Ampere threshold and
+  // C is within the matching limit.
+  const bool fp16 = std::is_same<sycl::half, DataType>::value;
+  bool allowFusing =
+      (C <= kMaxResBlockFusingChannels) ||
+      (fp16 && (shared_mem_size_ >= kMaxResBlockFusingSeFp16AmpereSmem) &&
+       (C <= kMaxResBlockFusingSeKFp16Ampere));
+
+  if (act_ == ACTIVATION_RELU) {
+    if (last_block_) {
+      if (has_se_)
+        OutputTransform<DataType, true, ACTIVATION_RELU, true, true, true,
+                        false>(N, C, se_k_, output, transformed_output, input,
+                               biases1_, w1_, b1_, w2_, b2_, sycl_queue);
+      else
+        OutputTransform<DataType, false, ACTIVATION_RELU, true, true, true,
+                        false>(N, C, se_k_, output, transformed_output, input,
+                               biases1_, w1_, b1_, w2_, b2_, sycl_queue);
+    } else {
+      if (has_se_) {
+        if (allowFusing) {
+          OutputInputTransform<DataType, true, ACTIVATION_RELU, true, true>(
+              N, C, se_k_, output, transformed_output, input, biases1_, w1_,
+              b1_, w2_, b2_, sycl_queue);
+        } else {
+          // Unfused fallback: OutputTransform writes into `input` (const cast
+          // away), then InputTransform produces `output` from it.
+          OutputTransform<DataType, true, ACTIVATION_RELU, true, true, true,
+                          true>(N, C, se_k_, (DataType*)input,
+                                transformed_output, input, biases1_, w1_, b1_,
+                                w2_, b2_, sycl_queue);
+          InputTransform<DataType, true>(N, C, output, (DataType*)input,
+                                         sycl_queue);
+        }
+      } else
+        OutputInputTransform<DataType, false, ACTIVATION_RELU, true, true>(
+            N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
+            w2_, b2_, sycl_queue);
+    }
+  } else if (act_ == ACTIVATION_MISH) {
+    // Same dispatch as the ACTIVATION_RELU branch above, instantiated for
+    // ACTIVATION_MISH.
+    if (last_block_) {
+      if (has_se_)
+        OutputTransform<DataType, true, ACTIVATION_MISH, true, true, true,
+                        false>(N, C, se_k_, output, transformed_output, input,
+                               biases1_, w1_, b1_, w2_, b2_, sycl_queue);
+      else
+        OutputTransform<DataType, false, ACTIVATION_MISH, true, true, true,
+                        false>(N, C, se_k_, output, transformed_output, input,
+                               biases1_, w1_, b1_, w2_, b2_, sycl_queue);
+    } else {
+      if (has_se_) {
+        if (allowFusing) {
+          OutputInputTransform<DataType, true, ACTIVATION_MISH, true, true>(
+              N, C, se_k_, output, transformed_output, input, biases1_, w1_,
+              b1_, w2_, b2_, sycl_queue);
+        } else {
+          OutputTransform<DataType, true, ACTIVATION_MISH, true, true, true,
+                          true>(N, C, se_k_, (DataType*)input,
+                                transformed_output, input, biases1_, w1_, b1_,
+                                w2_, b2_, sycl_queue);
+          InputTransform<DataType, true>(N, C, output, (DataType*)input,
+                                         sycl_queue);
+        }
+      } else
+        OutputInputTransform<DataType, false, ACTIVATION_MISH, true, true>(
+            N, C, se_k_, output, transformed_output, input, biases1_, w1_, b1_,
+            w2_, b2_, sycl_queue);
+    }
+  }
+  // "output" tensor now contains transformed input for the next
+  // convolution
+}
+
+// Releases the transformed filter and bias buffers of both convolutions and,
+// when has_se_ is set, the w1_/w2_/b1_/b2_ buffers.
+template <typename DataType>
+ResidualBlock<DataType>::~ResidualBlock() {
+  sycl::free(transformed_weights0_, sycl_queue_);
+  sycl::free(biases0_, sycl_queue_);
+  sycl::free(transformed_weights1_, sycl_queue_);
+  sycl::free(biases1_, sycl_queue_);
+
+  if (!has_se_) {
+    return;
+  }
+
+  sycl::free(w1_, sycl_queue_);
+  sycl::free(w2_, sycl_queue_);
+  sycl::free(b1_, sycl_queue_);
+  sycl::free(b2_, sycl_queue_);
+}
+
+// Allocates a device buffer for cpu_src, converted to DataType, and fills it.
+//
+// On return *gpu_dest points at the new buffer, or is nullptr when cpu_src is
+// empty (nothing is allocated in that case). The floats are staged in
+// `scratch`, which must be able to hold cpu_src.size() floats, and converted
+// by copyTypeConverted().
+template <typename DataType>
+void allocAndUpload(DataType** gpu_dest, std::vector<float> cpu_src,
+                    void* scratch, sycl::queue& sycl_queue) {
+  const size_t element_count = cpu_src.size();
+  if (element_count == 0) {
+    *gpu_dest = nullptr;
+    return;
+  }
+
+  DataType* const device_buffer = static_cast<DataType*>(
+      sycl::malloc_device(element_count * sizeof(DataType), sycl_queue));
+  *gpu_dest = device_buffer;
+
+  // Stage the floats, wait for the copy, then convert into the new buffer.
+  sycl_queue.memcpy(scratch, cpu_src.data(), element_count * sizeof(float))
+      .wait();
+  copyTypeConverted(device_buffer, static_cast<float*>(scratch),
+                    static_cast<int>(element_count), sycl_queue);
+}
+
+// Builds the attention policy head: records the layer sizes taken from
+// `weights`, uploads the ip/ip2/ip3/ip4 policy tensors, packs the ip2 and ip3
+// weights (and biases) back to back into wqk_w_ (and wqk_b_), and creates one
+// EncoderBlock per entry of weights.pol_encoder.
+//
+// `scratch` is the staging buffer handed to allocAndUpload and to every
+// EncoderBlock. When attention_body is false the activation is forced to
+// ACTIVATION_SELU.
+template <typename DataType>
+AttentionPolicyHead<DataType>::AttentionPolicyHead(
+    BaseLayer<DataType>* ip, const MultiHeadWeights::PolicyHead& weights,
+    void* scratch, bool attention_body, ActivationFunction act,
+    int max_batch_size, sycl::queue& sycl_queue)
+    : BaseLayer<DataType>(64 * 64 + 24 * 8, 1, 1, ip, sycl_queue),
+      attention_body_(attention_body),
+      // Old networks without attention body (e.g. T79) use hardcoded SELU
+      // activations.
+      act_(attention_body ? act : ACTIVATION_SELU) {
+  // Output sizes are read off the bias vectors.
+  embedding_op_size_ = weights.ip_pol_b.size();
+  wq_op_size_ = weights.ip2_pol_b.size();
+  wk_op_size_ = weights.ip3_pol_b.size();
+
+  encoder_heads_ = weights.pol_encoder_head_count;
+  policy_d_model_ = wq_op_size_;
+
+  allocAndUpload<DataType>(&ip_pol_w_, weights.ip_pol_w, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&ip_pol_b_, weights.ip_pol_b, scratch,
+                           sycl_queue_);
+
+  allocAndUpload<DataType>(&ip2_pol_w_, weights.ip2_pol_w, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&ip2_pol_b_, weights.ip2_pol_b, scratch,
+                           sycl_queue_);
+
+  allocAndUpload<DataType>(&ip3_pol_w_, weights.ip3_pol_w, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&ip3_pol_b_, weights.ip3_pol_b, scratch,
+                           sycl_queue_);
+
+  // One combined allocation holding the wq weights followed by the wk
+  // weights, and another one for the matching biases.
+  {
+    const size_t weight_count = weights.ip2_pol_w.size();
+    assert(weight_count == weights.ip3_pol_w.size());
+
+    const size_t weight_bytes = weight_count * sizeof(DataType);
+    wqk_w_ = static_cast<DataType*>(
+        sycl::malloc_device(weight_bytes * 2, sycl_queue_));
+    sycl_queue_.memcpy(wqk_w_, ip2_pol_w_, weight_bytes);
+    sycl_queue_.memcpy(wqk_w_ + weight_count, ip3_pol_w_, weight_bytes);
+
+    const size_t bias_count = weights.ip2_pol_b.size();
+    const size_t bias_bytes = bias_count * sizeof(DataType);
+    wqk_b_ = static_cast<DataType*>(
+        sycl::malloc_device(bias_bytes * 2, sycl_queue_));
+    sycl_queue_.memcpy(wqk_b_, ip2_pol_b_, bias_bytes);
+    sycl_queue_.memcpy(wqk_b_ + bias_count, ip3_pol_b_, bias_bytes);
+  }
+
+  allocAndUpload<DataType>(&ip4_pol_w_, weights.ip4_pol_w, scratch,
+                           sycl_queue_);
+
+  for (const auto& encoder_layer : weights.pol_encoder) {
+    // alpha is fixed at 1 for now (TODO: may change?). Smolgen weights are not
+    // implemented in policy encoder heads yet, hence the null scratch pointer
+    // and zero size. Attention-body nets don't have policy encoders, so the
+    // old epsilon (1e-6) is kept for backward compatibility with T78.
+    EncoderBlock<DataType>* block = new EncoderBlock<DataType>(
+        encoder_layer, scratch, encoder_heads_, embedding_op_size_, 1.0f,
+        nullptr, 0, max_batch_size, ACTIVATION_SWISH, act_, 1e-6, sycl_queue_);
+    encoder_weights_.emplace_back(block);
+  }
+}
+
+// Builds one encoder block from cpu_weights: records the layer sizes (read off
+// the bias vectors), uploads every weight tensor through allocAndUpload using
+// `scratch` as the staging buffer, and packs the q/k/v weights and biases back
+// to back into mha_qkv_w and mha_qkv_b.
+//
+// When the layer has smolgen weights they are uploaded as well, and
+// smol_global is set to smolgen_global_scratch, which is not allocated here.
+template <typename DataType>
+EncoderBlock<DataType>::EncoderBlock(
+    const MultiHeadWeights::EncoderLayer& cpu_weights, void* scratch, int heads,
+    int size, float alpha, DataType* smolgen_global_scratch,
+    int smolgen_global_size, int max_batch_size, ActivationFunction smolgen_act,
+    ActivationFunction ffn_act, float default_eps, sycl::queue& sycl_queue)
+    : embedding_op_size_(size),
+      encoder_heads_(heads),
+      alpha_(alpha),
+      has_smolgen_(cpu_weights.mha.has_smolgen),
+      smolgen_activation_(smolgen_act),
+      ffn_activation_(ffn_act),
+      max_batch_size_(max_batch_size),
+      default_eps_(default_eps),
+      sycl_queue_(sycl_queue) {
+  const auto& mha = cpu_weights.mha;
+  const auto& ffn = cpu_weights.ffn;
+
+  mha_q_size_ = mha.q_b.size();
+  mha_k_size_ = mha.k_b.size();
+  mha_v_size_ = mha.v_b.size();
+  mha_dense_size_ = mha.dense_b.size();
+  ffn_dense1_size_ = ffn.dense1_b.size();
+  ffn_dense2_size_ = ffn.dense2_b.size();
+
+  allocAndUpload<DataType>(&mha_q_w, mha.q_w, scratch, sycl_queue_);
+  allocAndUpload<DataType>(&mha_q_b, mha.q_b, scratch, sycl_queue_);
+
+  allocAndUpload<DataType>(&mha_k_w, mha.k_w, scratch, sycl_queue_);
+  allocAndUpload<DataType>(&mha_k_b, mha.k_b, scratch, sycl_queue_);
+
+  allocAndUpload<DataType>(&mha_v_w, mha.v_w, scratch, sycl_queue_);
+  allocAndUpload<DataType>(&mha_v_b, mha.v_b, scratch, sycl_queue_);
+
+  // One combined allocation holding the q, k and v weights one after the
+  // other, and another one for the matching biases. Each part is sized from
+  // the q tensor.
+  {
+    const size_t weight_count = mha.q_w.size();
+    const size_t weight_bytes = weight_count * sizeof(DataType);
+    mha_qkv_w = static_cast<DataType*>(
+        sycl::malloc_device(weight_bytes * 3, sycl_queue_));
+    sycl_queue_.memcpy(mha_qkv_w, mha_q_w, weight_bytes);
+    sycl_queue_.memcpy(mha_qkv_w + weight_count, mha_k_w, weight_bytes);
+    sycl_queue_.memcpy(mha_qkv_w + weight_count * 2, mha_v_w, weight_bytes);
+
+    const size_t bias_count = mha.q_b.size();
+    const size_t bias_bytes = bias_count * sizeof(DataType);
+    mha_qkv_b = static_cast<DataType*>(
+        sycl::malloc_device(bias_bytes * 3, sycl_queue_));
+    sycl_queue_.memcpy(mha_qkv_b, mha_q_b, bias_bytes);
+    sycl_queue_.memcpy(mha_qkv_b + bias_count, mha_k_b, bias_bytes);
+    sycl_queue_.memcpy(mha_qkv_b + bias_count * 2, mha_v_b, bias_bytes);
+  }
+
+  allocAndUpload<DataType>(&mha_dense_w, mha.dense_w, scratch, sycl_queue_);
+  allocAndUpload<DataType>(&mha_dense_b, mha.dense_b, scratch, sycl_queue_);
+
+  allocAndUpload<DataType>(&ln1_gammas, cpu_weights.ln1_gammas, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&ln1_betas, cpu_weights.ln1_betas, scratch,
+                           sycl_queue_);
+
+  allocAndUpload<DataType>(&ffn_dense1_w, ffn.dense1_w, scratch, sycl_queue_);
+  allocAndUpload<DataType>(&ffn_dense1_b, ffn.dense1_b, scratch, sycl_queue_);
+
+  allocAndUpload<DataType>(&ffn_dense2_w, ffn.dense2_w, scratch, sycl_queue_);
+  allocAndUpload<DataType>(&ffn_dense2_b, ffn.dense2_b, scratch, sycl_queue_);
+
+  allocAndUpload<DataType>(&ln2_gammas, cpu_weights.ln2_gammas, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&ln2_betas, cpu_weights.ln2_betas, scratch,
+                           sycl_queue_);
+
+  if (!has_smolgen_) {
+    return;
+  }
+
+  // Smolgen weights.
+  const auto& smolgen = mha.smolgen;
+
+  smol_compress_size_ = smolgen.compress.size() / mha_q_size_;
+  smol_dense_1_size_ = smolgen.dense1_b.size();
+  smol_dense_2_size_ = smolgen.dense2_b.size();
+  smol_global_size_ = smolgen_global_size;
+
+  allocAndUpload<DataType>(&smol_compress, smolgen.compress, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&smol_dense1_w, smolgen.dense1_w, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&smol_dense1_b, smolgen.dense1_b, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&smol_dense2_w, smolgen.dense2_w, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&smol_dense2_b, smolgen.dense2_b, scratch,
+                           sycl_queue_);
+
+  allocAndUpload<DataType>(&smol_ln1_gammas, smolgen.ln1_gammas, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&smol_ln1_betas, smolgen.ln1_betas, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&smol_ln2_gammas, smolgen.ln2_gammas, scratch,
+                           sycl_queue_);
+  allocAndUpload<DataType>(&smol_ln2_betas, smolgen.ln2_betas, scratch,
+                           sycl_queue_);
+
+  // GPU memory already allocated in AttentionBody.
+  smol_global = smolgen_global_scratch;
+}
+
+// Single GEMM wrapper that dispatches to cuBLAS (USE_CUBLAS), hipBLAS
+// (USE_HIPBLAS) or oneMKL (neither macro defined).
+//
+// The arguments follow the usual BLAS gemm order (transa, transb, m, n, k,
+// alpha, A, lda, B, ldb, beta, C, ldc). alpha and beta are always given as
+// float; on the cuBLAS and hipBLAS paths they are converted to fp16 bit
+// patterns with FP32toFP16 when DataType is sycl::half.
+//
+// NOTE(review): the hipBLAS return statuses are not checked, whereas the
+// cuBLAS calls are wrapped in ReportCUBLASErrors.
+template <typename DataType>
+static void cublasXgemm(transpose_type transa,
+                        transpose_type transb, int m, int n, int k,
+                        float alpha, const DataType* A, int lda,
+                        const DataType* B, int ldb, float beta, DataType* C,
+                        int ldc, sycl::queue &sycl_queue) {
+
+
+
+  // Selects the half-precision call at run time; both branches are compiled
+  // for every DataType, hence the pointer casts below.
+  const bool fp16 = std::is_same<sycl::half, DataType>::value;
+  
+  #ifdef USE_CUBLAS
+  cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        // The inner lambda captures by value, so the scalars and the handle
+        // are copied into the native command.
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        // Bind the BLAS handle to the native stream behind this SYCL queue.
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);    
+        ReportCUBLASErrors(cublasHgemm(
+          handle, transa, transb, m, n, k, (const half*)&alpha_h, ((const half *)A),
+          lda, ((const half *)B), ldb, (const half*)&beta_h, ((half *)C), ldc));
+      });
+    });
+  } else { 
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {  
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);  
+        ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha,
+                                   (const float*)A, lda, (const float*)B, ldb,
+                                   &beta, (float*)C, ldc));
+
+        });
+      });
+  }
+  #elif defined(USE_HIPBLAS)
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    sycl_queue.submit([&](sycl::handler &cgh) {
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        // Bind the BLAS handle to the native stream behind this SYCL queue.
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+        hipblasSetStream(handle, hipStreamHandle);
+        hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A,
+          lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc);
+        });
+      });
+  } else {
+    sycl_queue.submit([&](sycl::handler &cgh) {
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {  
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+        hipblasSetStream(handle, hipStreamHandle);  
+        hipblasSgemm(handle, transa, transb, m, n, k, &alpha, (const float*)A, lda, (const float*)B, ldb, &beta, (float*)C, ldc);
+        });
+      });
+  }
+  #else
+    // oneMKL path: alpha and beta are passed through as float.
+    oneapi::mkl::blas::column_major::gemm(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda,
+        (const DataType *)B, ldb, beta, (DataType *)C, ldc);
+  #endif
+
+}
+
+// Strided-batched GEMM wrapper that dispatches to cuBLAS (USE_CUBLAS), hipBLAS
+// (USE_HIPBLAS) or oneMKL (neither macro defined).
+//
+// The arguments follow the usual strided-batched gemm order: each of A, B and
+// C comes with its leading dimension and the stride between consecutive batch
+// entries, and batchCount multiplications are performed. A, B and C are passed
+// as void pointers and interpreted as DataType. alpha and beta are always
+// given as float; on the cuBLAS and hipBLAS paths they are converted to fp16
+// bit patterns with FP32toFP16 when DataType is sycl::half.
+//
+// NOTE(review): the hipBLAS return statuses are not checked, whereas the
+// cuBLAS calls are wrapped in ReportCUBLASErrors.
+template <typename DataType>
+static void cublasXGemmStridedBatched(transpose_type transa, transpose_type transb,
+    int m, int n, int k, float alpha, const void* A, int lda,
+    long long int strideA, const void* B, int ldb, long long int strideB,
+    float beta, void* C, int ldc, long long int strideC, int batchCount, sycl::queue &sycl_queue) {
+
+  // Selects the half-precision data and compute types at run time.
+  const bool fp16 = std::is_same<sycl::half, DataType>::value;
+  
+  #ifdef USE_CUBLAS
+  cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        // The inner lambda captures by value, so the scalars and the handle
+        // are copied into the native command.
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        // Bind the BLAS handle to the native stream behind this SYCL queue.
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);    
+
+        ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+          handle, transa, transb, m, n, k, &alpha_h, A, CUDA_R_16F, lda, strideA,
+          B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC,
+          batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT));
+        
+
+      });
+
+    });
+  
+  } else { 
+    
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+    
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);    
+    
+        ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+        handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B,
+        CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC,
+        batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
+  
+  
+      });
+    });
+  }
+  #elif defined(USE_HIPBLAS)
+  hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+
+    sycl_queue.submit([&](sycl::handler &cgh) {
+
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        // Bind the BLAS handle to the native stream behind this SYCL queue.
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+
+        hipblasSetStream(handle, hipStreamHandle);
+
+        hipblasGemmStridedBatchedEx(
+        handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B,
+        HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC,
+        batchCount, HIPBLAS_COMPUTE_16F, HIPBLAS_GEMM_DEFAULT);
+
+
+      });
+    });
+  } else {
+    sycl_queue.submit([&](sycl::handler &cgh) {
+
+      cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+    
+        hipblasSetStream(handle, hipStreamHandle);    
+    
+        hipblasGemmStridedBatchedEx(
+        handle, transa, transb, m, n, k, &alpha, A, HIPBLAS_R_32F, lda, strideA, B,
+        HIPBLAS_R_32F, ldb, strideB, &beta, C, HIPBLAS_R_32F, ldc, strideC,
+        batchCount, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT);
+  
+  
+      });
+    });
+  }
+  #else
+  // oneMKL path: alpha and beta are passed through as float and the void
+  // pointers are cast to DataType.
+  oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k,  alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount);
+  #endif
+}
+
+template <typename DataType>
+static void cublasXGemmBatched(transpose_type transa,
+                               transpose_type transb, int m, int n,
+                               int k, float alpha, DataType** A, int lda,
+                               DataType** B, int ldb, float beta, DataType** C,
+                               int ldc, int batchCount, sycl::queue &sycl_queue) {
+  // Batched GEMM over the pointer arrays A, B and C (batchCount entries each).
+  const bool fp16 = std::is_same<sycl::half, DataType>::value;
+  // fp16 selects the half-precision call; the BLAS backend is fixed at build time.
+  #ifdef USE_CUBLAS
+  cublasHandle_t handle = cuBlasContextManager::getcuBlasHandle_t();
+  
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    // alpha_h and beta_h carry the scalars as raw 16-bit values; they are cast to half* below.
+    // The [=] capture copies them into the native command, so the pointers handed to cuBLAS refer to the lambda's own copies.
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);    
+        // The handle is now bound to the native stream obtained from the interop handle.
+        ReportCUBLASErrors(cublasHgemmBatched(
+        handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda,
+        (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount));
+        
+      });
+
+    });
+
+  } else {
+    // fp32 path: alpha and beta are passed as plain floats.
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto cudaStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cudaStreamHandle);    
+
+        ReportCUBLASErrors(cublasSgemmBatched(
+        handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B,
+        ldb, &beta, (float**)C, ldc, batchCount));
+        
+      });
+
+    });
+  }
+
+  #elif defined(USE_HIPBLAS)
+  // NOTE(review): unlike the cuBLAS branch, the hipBLAS calls below discard their return status.
+   hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
+  
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+
+    // Same structure as the cuBLAS branch, using the queue's native HIP stream.
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+
+        hipblasSetStream(handle, hipStreamHandle);       
+
+        hipblasHgemmBatched(
+        handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda,
+        (hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, batchCount);
+        
+      });
+
+    });
+
+  } else {
+    
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
+        auto hipStreamHandle = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
+
+        hipblasSetStream(handle, hipStreamHandle);        
+
+        hipblasSgemmBatched(
+        handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B,
+        ldb, &beta, (float**)C, ldc, batchCount);
+        
+
+      });
+
+    });
+  }
+
+  #else
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    // Pointer-array gemm_batch called with a count of 1 and batchCount by address — presumably oneMKL's group API with a single group; confirm.
+    oneapi::mkl::blas::column_major::gemm_batch(
+        sycl_queue, &transa, &transb, &m, &n, &k,  (const sycl::half*)&alpha_h,
+        (const sycl::half **)A, &lda, (const sycl::half **)B, &ldb,
+        (const sycl::half*)&beta_h, (sycl::half **)C, &ldc, 1, &batchCount);
+  } else {
+    oneapi::mkl::blas::column_major::gemm_batch(
+        sycl_queue, &transa, &transb, &m, &n, &k,  &alpha, (const float **)A,
+        &lda, (const float **)B, &ldb, &beta, (float **)C, &ldc, 1,
+        &batchCount);
+  }
+  // NOTE(review): m, n, k, lda, ldb, ldc and batchCount are ints passed by address here — confirm this matches the oneMKL overload in use.
+  #endif
+}
+
+// Evaluates one encoder block. input/output tensor is in_out_tensor, others are used as scratch.
+template <typename DataType>
+void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
+                                  DataType* scratch, DataType* buffer1,
+                                  DataType* buffer2, sycl::queue &sycl_queue,
+                                  DataType*** offset_pointers) {
+  // NOTE(review): both the sycl_queue argument and the sycl_queue_ member are used below; presumably they are the same queue — confirm.
+  //CERR << "EncoderBlock<DataType>::Eval. ";
+  // depth is the width of one attention head: d_model divided by encoder_heads_.
+  const int d_model = mha_q_size_;
+  const int depth = d_model / encoder_heads_;
+
+  // Calculate smolgen weights. Do this first so we can make use of
+  // scratch, buffer1 and buffer2.
+  if (has_smolgen_) {
+    {
+      // Compress.
+      // input shape: N, 64, d_model
+      // output shape: N, 64, hidden_channels
+      const int num_inputs = d_model;
+      const int num_outputs = smol_compress_size_;
+      const int batch = N * 64;
+      cublasXgemm<DataType>(transpose_type_transpose,
+          transpose_type_notranspose, num_outputs, batch, num_inputs,
+          1.0f, (const DataType*)smol_compress, num_inputs, in_out_tensor,
+          num_inputs, 0.0f, scratch, num_outputs, sycl_queue);
+    }
+
+    {
+      // Hidden 1 dense.
+      // input shape: N, 64 * hidden_channels
+      // output shape: N, hidden_sz
+      const int num_inputs = 64 * smol_compress_size_;
+      const int num_outputs = smol_dense_1_size_;
+      const int batch = N;
+      cublasXgemm<DataType>(transpose_type_transpose,
+                            transpose_type_notranspose, num_outputs,
+                            batch, num_inputs, 1.0f,
+                            (const DataType*)smol_dense1_w, num_inputs, scratch,
+                            num_inputs, 0.0f, buffer1, num_outputs, sycl_queue);
+      // Layer norm with the dense1 bias and no skip input (nullptr): buffer1 -> scratch.
+      LayerNorm<DataType>(batch, num_outputs, scratch, buffer1, smol_dense1_b,
+                          (DataType*)nullptr, smol_ln1_gammas, smol_ln1_betas,
+                          1e-3, 1.0, smolgen_activation_, sycl_queue);
+    }
+
+    {
+      // Hidden 2 dense (gen_from)
+      // input shape: N, hidden_sz
+      // output shape: N, heads * gen_sz
+      const int num_inputs = smol_dense_1_size_;
+      const int num_outputs = smol_dense_2_size_;
+      const int batch = N;
+      cublasXgemm<DataType>(transpose_type_transpose,
+                            transpose_type_notranspose, num_outputs,
+                            batch, num_inputs, 1.0f,
+                            (const DataType*)smol_dense2_w, num_inputs, scratch,
+                            num_inputs, 0.0f, buffer1, num_outputs, sycl_queue);
+      // Same pattern with the dense2 bias and the second set of gammas/betas: buffer1 -> scratch.
+      LayerNorm<DataType>(batch, num_outputs, scratch, buffer1, smol_dense2_b,
+                          (DataType*)nullptr, smol_ln2_gammas, smol_ln2_betas,
+                          1e-3, 1.0, smolgen_activation_, sycl_queue);
+    }
+
+    {
+      // Final smolgen weights generation.
+      /*
+        gen_from = tf.reshape(gen_from, [-1, heads, gen_sz])
+        out = self.smol_weight_gen_dense(gen_from)
+      */
+      const int num_inputs =
+          smol_dense_2_size_ / encoder_heads_; /* num_inputs == gen_sz == 256 */
+      const int num_outputs = smol_global_size_; /* hwhw: 64 * 64 */
+      const int batch = N * encoder_heads_;
+      // scratch -> buffer2; buffer2 is handed to Softmax further down.
+      cublasXgemm<DataType>(transpose_type_transpose,
+                            transpose_type_notranspose, num_outputs,
+                            batch, num_inputs, 1.0f,
+                            (const DataType*)smol_global, num_inputs, scratch,
+                            num_inputs, 0.0f, buffer2, num_outputs, sycl_queue);
+    }
+  }
+
+  DataType* mha_q;
+  DataType* mha_k;
+  DataType* mha_v;
+  // Q, K and V projections are computed by one strided-batched GEMM (batchCount = 3).
+  {
+    const int num_inputs = embedding_op_size_;
+    const int num_outputs = d_model;
+    const int batch = N * 64;
+    const int max_batch = max_batch_size_ * 64;
+    // The three outputs sit back to back in scratch, spaced by num_outputs * max_batch.
+    mha_q = scratch;
+    mha_k = mha_q + num_outputs * max_batch;
+    mha_v = mha_k + num_outputs * max_batch;
+    // The stride passed for in_out_tensor is 0, so every projection reads the same input.
+    cublasXGemmStridedBatched<DataType>(transpose_type_transpose, transpose_type_notranspose,
+        num_outputs, batch, num_inputs, 1.0f, mha_qkv_w, num_inputs,
+        num_inputs * num_outputs, in_out_tensor, num_inputs, 0, 0.0f, mha_q,
+        num_outputs, num_outputs * max_batch, 3, sycl_queue);
+    addBiasBatched<DataType>(mha_q, mha_q, mha_qkv_b, 3, batch, num_outputs,
+                             max_batch, ACTIVATION_NONE, sycl_queue);
+  }
+
+  // Apply split_heads() to q, k and v
+  // which basically transposes (batch_size, 64, num_heads, depth)
+  // to (batch_size, num_heads, 64, depth)
+  // Do we really need to transpose here?
+  // (Maybe not, we can play with strides of the gemm and do independent gemms
+  // for each encoder head)
+
+  // Apply scaled dot product attention:
+  /*
+      matmul_qk = tf.matmul(q, k, transpose_b=True)
+      dk = tf.cast(tf.shape(k)[-1], self.model_dtype)
+      scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+      attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
+      output = tf.matmul(attention_weights, v)
+  */
+
+  // shape(k)[-1] = depth
+  float factor = 1.0f / sqrt((float)depth);
+
+  // matmul_qk = tf.matmul(q, k, transpose_b=True)
+  {
+    if (*offset_pointers == nullptr) {
+      // First call only: allocate room for 5 groups of encoder_heads_ * max_batch_size_ pointers and fill them.
+      *offset_pointers = sycl::malloc_device<DataType*>(
+                               encoder_heads_ * max_batch_size_ * 5,
+                               sycl_queue_);
+      genOffsetPointers(*offset_pointers, encoder_heads_, max_batch_size_,
+                        depth, d_model, mha_k, mha_q, buffer1,
+                        mha_v, buffer2, sycl_queue_);
+    }
+    // NOTE(review): the table is built once from this call's buffers; assumes later calls pass the same scratch/buffer1/buffer2 — confirm.
+    cublasXGemmBatched<DataType>(transpose_type_transpose, transpose_type_notranspose,
+        64 /*M*/, 64 /*N*/, depth /*K*/,  // A/B, and M/N are swapped for
+                                          // row-major to col-major transform
+        factor,            // to handle "/ tf.math.sqrt(dk)"
+        *offset_pointers,  // mha_k + offset /*A*/,
+        d_model /*LDA*/,   // (d_model = depth * encoder_heads_) to skip over
+                           // other "depth" slices / heads
+        // 64 * d_model,     /*strideA*/
+        *offset_pointers +
+            encoder_heads_ * max_batch_size_,  // mha_q + offset /*B*/,
+        d_model /*LDB*/,  // to skip over other "depth" slices / heads
+        // 64 * d_model,     /*strideB*/
+        0.0f,
+        *offset_pointers + encoder_heads_ * max_batch_size_ *
+                               2,  // buffer1 + outOffset /*C*/,  // output
+                                   // (matmul_qk) goes to buffer1
+        64 /*LDC*/,
+        // 64 * 64 /*strideC*/,
+        N * encoder_heads_, sycl_queue_);
+  }
+
+  // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)
+  // attention_weights -> buffer1
+  if (has_smolgen_) {
+    // Add smolgen weights to the scaled matmul_qk attention logits before
+    // softmax.
+    Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, buffer2, sycl_queue_);
+  } else {
+    Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1,
+            (const DataType*)nullptr, sycl_queue_);
+  }
+  // output = tf.matmul(attention_weights, v); the result goes to buffer2.
+  {
+    cublasXGemmBatched<DataType>(transpose_type_notranspose,
+        transpose_type_notranspose, depth /*M*/, 64 /*N*/, 64 /*K*/, 1.0f,
+        *offset_pointers + encoder_heads_ * max_batch_size_ *
+                               3,  // mha_v + offset /*A*/,  // "v" matrix
+        d_model /*LDA*/,           // to skip over other "depth" slices / heads
+        // 64 * d_model,          /*strideA*/
+        *offset_pointers + encoder_heads_ * max_batch_size_ *
+                               2,  // buffer1 + weightsOffset /*B*/,
+        64 /*LDB*/,                // 64 * 64, /*strideB*/
+        0.0f,
+        *offset_pointers +
+            encoder_heads_ * max_batch_size_ *
+                4,  // buffer2 + offset /*C*/,  // output goes to buffer2
+        d_model /*LDC*/,
+        // 64 * d_model /*strideC*/,
+        N * encoder_heads_, sycl_queue_);
+  }
+
+  // #final dense layer (mha_dense), buffer2 -> buffer1
+  {
+    const int num_inputs = d_model;
+    const int num_outputs = embedding_op_size_;
+    const int batch = N * 64;
+    cublasXgemm(transpose_type_transpose,
+                transpose_type_notranspose, num_outputs, batch,
+                num_inputs, 1.0f, (const DataType*)mha_dense_w, num_inputs,
+                buffer2, num_inputs, 0.0f, buffer1, num_outputs, sycl_queue_);
+  }
+
+  // LN1: skip connection and layer normalization (also bias add of prev gemm)
+  // buffer1/in_out_tensor -> scratch
+  LayerNorm<DataType>(N * 64, embedding_op_size_, scratch, buffer1, mha_dense_b,
+                      in_out_tensor, ln1_gammas, ln1_betas, default_eps_,
+                      alpha_, ACTIVATION_NONE, sycl_queue_);
+
+  // #FFN dense 1, scratch -> in_out_tensor
+  {
+    const int num_inputs = embedding_op_size_;
+    const int num_outputs = ffn_dense1_size_;  // encoder_dff
+    const int batch = N * 64;
+    cublasXgemm(transpose_type_transpose,
+                transpose_type_notranspose, num_outputs, batch,
+                num_inputs, 1.0f, (const DataType*)ffn_dense1_w, num_inputs,
+                scratch, num_inputs, 0.0f, in_out_tensor, num_outputs, sycl_queue_);
+    addBiasBatched(in_out_tensor, in_out_tensor, ffn_dense1_b, 1, batch,
+                   num_outputs, ffn_activation_, sycl_queue_);
+  }
+
+  // #FFN dense 2, in_out_tensor -> buffer1
+  {
+    const int num_inputs = ffn_dense1_size_;  // encoder_dff
+    const int num_outputs = embedding_op_size_;
+    const int batch = N * 64;
+    cublasXgemm(transpose_type_transpose,
+                transpose_type_notranspose, num_outputs, batch,
+                num_inputs, 1.0f, (const DataType*)ffn_dense2_w, num_inputs,
+                in_out_tensor, num_inputs, 0.0f, buffer1, num_outputs, sycl_queue_);
+  }
+
+  // LN2: skip connection and layer normalization (also bias add of prev gemm)
+  // buffer1/scratch -> in_out_tensor
+  LayerNorm<DataType>(N * 64, embedding_op_size_, in_out_tensor, buffer1,
+                      ffn_dense2_b, scratch, ln2_gammas, ln2_betas,
+                      default_eps_, alpha_, ACTIVATION_NONE, sycl_queue_);
+}
+
+template <typename DataType>
+void AttentionPolicyHead<DataType>::Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** offset_pointers) {
+  // Policy head: embedding GEMM, the encoder layers, then Q/K attention logits with promotion logits interleaved into output.
+  //CERR << "AttentionPolicyHead<DataType>::Eval. ";
+  // input2 is written to despite its const type; buffer1 and buffer2 start scratch_size / 2 bytes into output and input2.
+  DataType* input2_tensor = (DataType*)input2;
+  DataType* buffer1 = output + scratch_size / (2 * sizeof(DataType));
+  DataType* buffer2 = input2_tensor + scratch_size / (2 * sizeof(DataType));
+
+  int inputC = this->input_->GetC();
+  if (!attention_body_)
+    convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8, sycl_queue);
+
+  // 1. Policy embedding (fully connected layer)
+  // Input data in NHWC layout N*(64)*C, output is N*(64)*embedding_op_size_
+  DataType* pol_embedding = input2_tensor;
+  {
+    const int num_outputs = embedding_op_size_;
+    const int num_inputs = inputC;
+    const int batch = N * 64;
+    cublasXgemm<DataType>(
+        transpose_type_transpose, transpose_type_notranspose,
+        num_outputs, batch, num_inputs, 1.0f, (const DataType*)ip_pol_w_,
+        num_inputs, attention_body_ ? input : (DataType*)scratch, num_inputs,
+        0.0f, pol_embedding, num_outputs, sycl_queue);
+
+    addBiasBatched(pol_embedding, pol_embedding, ip_pol_b_, 1, batch,
+                   num_outputs, act_, sycl_queue);
+  }
+
+  // 2. Encoder layers
+  for (const auto pEnc : encoder_weights_) {
+    pEnc->Eval(N, input2_tensor, (DataType*)scratch, buffer1, buffer2, sycl_queue, offset_pointers);
+  }  // End of encoder blocks
+
+  DataType* wq;
+  DataType* wk;
+  {
+    const int num_inputs = embedding_op_size_;
+    const int num_outputs = policy_d_model_;
+    const int batch = N * 64;
+    wq = (DataType*)scratch;
+    wk = wq + num_outputs * batch;
+    // wq and wk are produced by one call (batchCount = 2); the 0 stride makes both read the same input2_tensor.
+    cublasXGemmStridedBatched<DataType>(
+        transpose_type_transpose, transpose_type_notranspose,
+        num_outputs, batch, num_inputs, 1.0f, wqk_w_, num_inputs,
+        num_inputs * num_outputs, input2_tensor, num_inputs, 0, 0.0f, wq,
+        num_outputs, num_outputs * batch, 2, sycl_queue);
+
+    addBiasBatched<DataType>(wq, wq, wqk_b_, 2, batch, num_outputs,
+                             ACTIVATION_NONE, sycl_queue);
+  }
+
+  // dk = tf.math.sqrt(tf.cast(tf.shape(keys)[-1], self.model_dtype))
+  // policy matmul_qk = tf.matmul(queries, keys, transpose_b=True)
+  // policy_attn_logits = matmul_qk / dk
+  {
+    // shape(keys)[-1] = policy_d_model_
+    float factor = 1.0f / sqrt((float)policy_d_model_);
+
+    // A/B, and M/N are swapped for row-major to col-major transform
+    // leave 8*24 after each batch to interleave promotion_logits (computed
+    // later below)
+    cublasXGemmStridedBatched<DataType>(
+        transpose_type_transpose, transpose_type_notranspose,
+        64 /*M*/, 64 /*N*/, policy_d_model_ /*K*/,
+        factor,  // to handle "/ tf.math.sqrt(dk)"
+        wk /*A*/, policy_d_model_ /*LDA*/, 64 * policy_d_model_, /*strideA*/
+        wq /*B*/, policy_d_model_ /*LDB*/, 64 * policy_d_model_, /*strideB*/
+        0.0f, output /*C*/,  // output (policy_attn_logits)
+        64 /*LDC*/, 64 * 64 + 8 * 24 /*strideC*/, N, sycl_queue);
+  }
+
+  // Compute promotion_logits in a single kernel (and put the result just after
+  // policy_attn_logits interleaved to get concat for free)
+  DataType* promotion_logits = output + 64 * 64;
+  // promotion_logits points just past the first 64 * 64 attention logits in output.
+  ComputePromotionLogits<DataType>(N, policy_d_model_, promotion_logits, wk,
+                                   ip4_pol_w_, output, sycl_queue);
+}
+
+template <typename DataType>
+AttentionPolicyHead<DataType>::~AttentionPolicyHead() {
+  // Buffers owned by the policy head, released in the order listed.
+  void* const owned[] = {ip_pol_w_,  ip_pol_b_,  ip2_pol_w_,
+                         ip2_pol_b_, ip3_pol_w_, ip3_pol_b_,
+                         ip4_pol_w_, wqk_w_,     wqk_b_};
+  for (void* buffer : owned) {
+    sycl::free(buffer, sycl_queue_);
+  }
+  for (const auto pEnc : encoder_weights_) {
+    delete pEnc;
+  }
+}
+
+template <typename DataType>
+EncoderBlock<DataType>::~EncoderBlock() {
+  // Buffers every encoder block owns, listed in the order they are
+  // released.
+  void* const core_buffers[] = {
+      mha_q_w,      mha_q_b,      mha_k_w,      mha_k_b,
+      mha_v_w,      mha_v_b,      mha_qkv_w,    mha_qkv_b,
+      mha_dense_w,  mha_dense_b,  ln1_gammas,   ln1_betas,
+      ffn_dense1_w, ffn_dense1_b, ffn_dense2_w, ffn_dense2_b,
+      ln2_gammas,   ln2_betas,
+  };
+  for (void* buffer : core_buffers) {
+    sycl::free(buffer, sycl_queue_);
+  }
+
+  // smolgen buffers are released only when has_smolgen_ is set.
+  if (!has_smolgen_) {
+    return;
+  }
+  void* const smolgen_buffers[] = {
+      smol_compress,
+      smol_dense1_w,
+      smol_dense1_b,
+      smol_dense2_w,
+      smol_dense2_b,
+      smol_ln1_gammas,
+      smol_ln1_betas,
+      smol_ln2_gammas,
+      smol_ln2_betas,
+  };
+  for (void* buffer : smolgen_buffers) sycl::free(buffer, sycl_queue_);
+}
+
+template <typename DataType>
+EmbeddingLayer<DataType>::EmbeddingLayer(BaseLayer<DataType>* ip,
+                                         const std::vector<float>& weights,
+                                         const std::vector<float>& biases,
+                                         void* scratch, ActivationFunction act,
+                                         sycl::queue &sycl_queue)
+    : BaseLayer<DataType>(biases.size(), 8, 8, ip, sycl_queue), act_(act) {  // base layer is constructed with biases.size(), 8, 8
+  allocAndUpload<DataType>(&weights_, weights, scratch, sycl_queue_);  // presumably allocates weights_ and uploads `weights` through scratch — confirm
+  allocAndUpload<DataType>(&biases_, biases, scratch, sycl_queue_);  // same for biases_; both pointers are released in ~EmbeddingLayer
+}
+
+template <typename DataType>
+EmbeddingLayer<DataType>::~EmbeddingLayer() {
+  void* const owned[] = {weights_, biases_};  // Buffers set up by the ctor.
+  for (void* buffer : owned) sycl::free(buffer, sycl_queue_);
+}
+
+template <typename DataType>
+void EmbeddingLayer<DataType>::Eval(
+    int N, DataType* output, const DataType* input, const DataType* /*input2*/,
+    void* /*scratch*/, size_t /*scratch_size*/, sycl::queue &sycl_queue, DataType***) {
+  // One GEMM of weights_ against input, written to output, followed by
+  // addBiasBatched with biases_ and act_. The unnamed parameters are unused.
+  const int out_features = this->GetC();
+  const int in_features = this->input_->GetC();
+  const int rows = N * 64;
+
+  cublasXgemm<DataType>(transpose_type_transpose, transpose_type_notranspose,
+                        out_features, rows, in_features, 1.0f, weights_,
+                        in_features, input, in_features, 0.0f, output,
+                        out_features, sycl_queue);
+  addBiasBatched(output, output, biases_, 1, rows, out_features, act_, sycl_queue);
+}
+
+template <typename DataType>
+AttentionBody<DataType>::AttentionBody(const MultiHeadWeights& weights,
+                                       void* scratch, Activations activations,
+                                       int num_res_blocks, int input_c,
+                                       int max_batch_size,
+                                       bool is_pe_dense_embedding,
+                                       sycl::queue &sycl_queue)
+    : BaseLayer<DataType>(weights.ip_emb_b.size(), 8, 8, nullptr, sycl_queue),
+      embedding_op_size_(weights.ip_emb_b.size()),
+      encoder_head_count_(weights.encoder_head_count),
+      activations_(activations),
+      num_resi_blocks_(num_res_blocks),
+      input_c_(input_c),
+      has_gating_(weights.ip_mult_gate.size() > 0 &&
+                  weights.ip_add_gate.size() > 0),
+      has_smolgen_(weights.has_smolgen),
+      is_pe_dense_embedding_(is_pe_dense_embedding) {
+  allocAndUpload<DataType>(&ip_emb_w_, weights.ip_emb_w, scratch, sycl_queue_);
+  allocAndUpload<DataType>(&ip_emb_b_, weights.ip_emb_b, scratch, sycl_queue_);
+  // Dense-embedding weights are uploaded only when is_pe_dense_embedding_ is set; otherwise the kPosEncoding table is uploaded.
+  if (is_pe_dense_embedding_) {
+    allocAndUpload<DataType>(&ip_emb_pre_w_, weights.ip_emb_preproc_w, scratch,
+                             sycl_queue_);
+    allocAndUpload<DataType>(&ip_emb_pre_b_, weights.ip_emb_preproc_b, scratch,
+                             sycl_queue_);
+
+    allocAndUpload<DataType>(&ip_emb_ln_g_, weights.ip_emb_ln_gammas, scratch,
+                             sycl_queue_);
+    allocAndUpload<DataType>(&ip_emb_ln_b_, weights.ip_emb_ln_betas, scratch,
+                             sycl_queue_);
+
+    allocAndUpload<DataType>(&ip_emb_ffn_d1_w_, weights.ip_emb_ffn.dense1_w,
+                             scratch, sycl_queue_);
+    allocAndUpload<DataType>(&ip_emb_ffn_d1_b_, weights.ip_emb_ffn.dense1_b,
+                             scratch, sycl_queue_);
+
+    allocAndUpload<DataType>(&ip_emb_ffn_d2_w_, weights.ip_emb_ffn.dense2_w,
+                             scratch, sycl_queue_);
+    allocAndUpload<DataType>(&ip_emb_ffn_d2_b_, weights.ip_emb_ffn.dense2_b,
+                             scratch, sycl_queue_);
+
+    allocAndUpload<DataType>(&ip_emb_ffn_ln_g_, weights.ip_emb_ffn_ln_gammas,
+                             scratch, sycl_queue_);
+    allocAndUpload<DataType>(&ip_emb_ffn_ln_b_, weights.ip_emb_ffn_ln_betas,
+                             scratch, sycl_queue_);
+
+    // 12 is the number of input channels used for the input encoding; the sizes below are derived from the bias vector lengths.
+    embedding_dense_size_ = weights.ip_emb_preproc_b.size() / 64;
+    embedding_ffn_size_ = weights.ip_emb_ffn.dense2_b.size();
+    embedding_ffn_dff_ = weights.ip_emb_ffn.dense1_b.size();
+  } else {
+    size_t size = 64 * kNumPosEncodingChannels * sizeof(float);
+    pos_encoding_ = (DataType *)sycl::malloc_device(size, sycl_queue_);
+    sycl_queue_.memcpy(scratch, kPosEncoding, size);
+    copyTypeConverted(pos_encoding_, (float*)scratch, size, sycl_queue_);
+  }
+  // NOTE(review): in the else branch above, `size` is a byte count and is also passed to copyTypeConverted — confirm that helper expects bytes rather than an element count.
+  if (has_gating_) {
+    allocAndUpload<DataType>(&ip_mult_gate_, weights.ip_mult_gate, scratch, sycl_queue_);
+    allocAndUpload<DataType>(&ip_add_gate_, weights.ip_add_gate, scratch, sycl_queue_);
+  }
+
+  if (has_smolgen_) {
+    allocAndUpload<DataType>(&smolgen_global_, weights.smolgen_w, scratch, sycl_queue_);
+    smolgen_global_size_ = 64 * 64;
+  }
+  // NOTE(review): smolgen_global_ and smolgen_global_size_ are assigned only when has_smolgen_ is true, yet are passed to every EncoderBlock below — presumably default-initialized in the class; confirm.
+  int num_encoders = weights.encoder.size();
+  float alpha = (float)pow(2.0 * num_encoders, -0.25);
+  for (const auto& enc : weights.encoder) {
+    EncoderBlock<DataType>* pW = new EncoderBlock<DataType>(
+        enc, scratch, encoder_head_count_, embedding_op_size_, alpha,
+        smolgen_global_, smolgen_global_size_, max_batch_size,
+        activations_.smolgen_activation, activations_.ffn_activation,
+        is_pe_dense_embedding_ ? 1e-3 : 1e-6, sycl_queue_);
+    // The raw pointer is stored in encoder_weights_; ~AttentionBody deletes each block.
+    encoder_weights_.emplace_back(pW);
+  }
+}
+
+template <typename DataType>
+AttentionBody<DataType>::~AttentionBody() {
+  // Hands one buffer back through sycl::free on sycl_queue_.
+  const auto release = [this](void* ptr) { sycl::free(ptr, sycl_queue_); };
+  release(ip_emb_w_);
+  release(ip_emb_b_);
+  if (!is_pe_dense_embedding_) {
+    release(pos_encoding_);
+  } else {
+    void* const dense_embedding[] = {
+        ip_emb_pre_w_,    ip_emb_pre_b_,    ip_emb_ln_g_,     ip_emb_ln_b_,
+        ip_emb_ffn_d1_w_, ip_emb_ffn_d1_b_, ip_emb_ffn_d2_w_, ip_emb_ffn_d2_b_,
+        ip_emb_ffn_ln_g_, ip_emb_ffn_ln_b_};
+    for (void* ptr : dense_embedding) release(ptr);
+  }
+  // Optional buffers are freed under the flags that guarded their upload.
+  if (has_gating_) {
+    release(ip_mult_gate_);
+    release(ip_add_gate_);
+  }
+  if (has_smolgen_) {
+    release(smolgen_global_);
+  }
+  // The encoder blocks were created with new in the constructor.
+  for (const auto pEnc : encoder_weights_) {
+    delete pEnc;
+  }
+}
+
+template <typename DataType>
+void AttentionBody<DataType>::Eval(int N, DataType* output,
+                                   const DataType* input,
+                                   const DataType* input2, void* scratch,
+                                   size_t scratch_size, 
+                                   sycl::queue &sycl_queue,
+                                   DataType*** offset_pointers) {
+  // Attention body: preprocess the input into scratch, embed it into output, then run every encoder block in place on output.
+  //CERR << "AttentionBody<DataType>::Eval. ";
+  // buffer1 aliases input2 (its const is cast away); buffer2 starts scratch_size / 2 bytes after it.
+  DataType* output_tensor = (DataType*)output;
+  DataType* buffer1 = (DataType*)input2;
+  DataType* buffer2 = buffer1 + scratch_size / (2 * sizeof(DataType));
+
+  int inputC = input_c_;
+  if (num_resi_blocks_ == 0) {
+    assert(inputC == kInputPlanes);
+    /*
+      # if there are no residual blocks (pure transformer), do some input
+      processing
+    */
+    if (is_pe_dense_embedding_) {
+      // New encoding is made of dense layer fed with input from a 12-channel
+      // slice of the input tensor.
+      // pos_info = flow[..., :12]
+      // pos_info_flat = tf.reshape(pos_info, [-1, 64 * 12])
+      // pos_info_processed = tf.keras.layers.Dense(64*self.embedding_dense_sz,
+      //                                            name=name+"embedding/preprocess")(pos_info_flat)
+      const int num_outputs = 64 * embedding_dense_size_;
+      const int num_inputs = 64 * 12;
+      const int batch = N;
+      // Presumably the 12 passed below selects the 12-channel slice described above — confirm against convertNCHWtoNHWC.
+      convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, 12, 8, 8, sycl_queue);
+      cublasXgemm<DataType>(
+          transpose_type_transpose, transpose_type_notranspose, num_outputs, batch, num_inputs,
+          1.0f, (const DataType*)ip_emb_pre_w_, num_inputs,
+          (const DataType*)scratch, num_inputs, 0.0f, buffer1, num_outputs, sycl_queue);
+
+      // addBiasBatched(buffer1, buffer1, ip_emb_pre_b_, batch, N, num_outputs,
+      //               ACTIVATION_NONE, sycl_queue);
+      const int size = num_outputs * N;
+      // @todo addBiasBatched has a 4096 channel limit, needs refactoring.
+      addVectors(buffer1, buffer1, ip_emb_pre_b_, size, size, num_outputs,
+                 ACTIVATION_NONE, sycl_queue);
+      inputPreprocessForAttentionBody((DataType*)scratch, input, buffer1, N,
+                                      kInputPlanes, embedding_dense_size_, true,
+                                      sycl_queue);
+      inputC += embedding_dense_size_;
+    } else {
+      /*
+      flow = tf.transpose(inputs, perm=[0, 2, 3, 1])
+      flow = tf.reshape(flow, [-1, 64, tf.shape(inputs)[1]])
+      # add positional encoding for each square to the input
+      positional_encoding = tf.broadcast_to(tf.convert_to_tensor(self.POS_ENC,
+      dtype=self.model_dtype), [tf.shape(flow)[0], 64,
+      tf.shape(self.POS_ENC)[2]]) flow = tf.concat([flow, positional_encoding],
+      axis=2)
+      */
+      inputPreprocessForAttentionBody((DataType*)scratch, input, pos_encoding_,
+                                      N, kInputPlanes, kNumPosEncodingChannels,
+                                      false, sycl_queue);
+      inputC += kNumPosEncodingChannels;
+    }
+  } else {
+    // #redirect flow through encoder blocks
+    // flow = tf.transpose(flow, perm = [ 0, 2, 3, 1 ])
+    // flow = tf.reshape(flow, [ -1, 64, self.RESIDUAL_FILTERS ])
+    convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8, sycl_queue);
+  }
+  // Every branch above wrote its result to scratch; inputC is the channel count used as num_inputs below.
+  if (is_pe_dense_embedding_) {
+    // 1. square embedding (fully connected layer)
+    // Input data in NHWC layout N*(64)*C, output is N*(64)*embedding_op_size_
+    DataType* embedding = output_tensor;
+    DataType* temp = (DataType*)scratch;
+    {
+      const int num_outputs = embedding_op_size_;
+      const int num_inputs = inputC;
+      const int batch = N * 64;
+      cublasXgemm<DataType>(transpose_type_transpose, transpose_type_notranspose, num_outputs,
+                            batch, num_inputs, 1.0f, (const DataType*)ip_emb_w_,
+                            num_inputs, temp, num_inputs, 0.0f, embedding,
+                            num_outputs, sycl_queue);
+      // embedding layer norm with fused in bias add of previous gemm.
+      LayerNorm<DataType>(N * 64, embedding_op_size_, temp, embedding,
+                          ip_emb_b_, (DataType*)nullptr, ip_emb_ln_g_,
+                          ip_emb_ln_b_, 1e-3, 1.0,
+                          activations_.default_activation, sycl_queue);
+    }
+
+    // Input gating
+    if (has_gating_) {
+      applyInputGating<DataType>(temp, temp, ip_mult_gate_, ip_add_gate_, N, 64,
+                                 embedding_op_size_, sycl_queue);
+    }
+
+    // embedding FFN dense 1
+    {
+      const int num_inputs = embedding_ffn_size_;
+      const int num_outputs = embedding_ffn_dff_;  // encoder_dff
+      const int batch = N * 64;
+      cublasXgemm(transpose_type_transpose, transpose_type_notranspose, num_outputs, batch,
+                  num_inputs, 1.0f, (const DataType*)ip_emb_ffn_d1_w_,
+                  num_inputs, temp, num_inputs, 0.0f, buffer1, num_outputs, sycl_queue);
+      addBiasBatched(buffer1, buffer1, ip_emb_ffn_d1_b_, 1, batch, num_outputs,
+                     activations_.ffn_activation, sycl_queue);
+    }
+
+    // embedding FFN dense 2
+    {
+      const int num_inputs = embedding_ffn_dff_;  // encoder_dff
+      const int num_outputs = embedding_ffn_size_;
+      const int batch = N * 64;
+      cublasXgemm(transpose_type_transpose, transpose_type_notranspose, num_outputs, batch,
+                  num_inputs, 1.0f, (const DataType*)ip_emb_ffn_d2_w_,
+                  num_inputs, buffer1, num_inputs, 0.0f, buffer2, num_outputs, sycl_queue);
+      // Embedding LN: skip connection and layer normalization (also bias add of
+      // prev gemm) buffer2 -> embedding
+      float alpha = (float)pow(2. * encoder_weights_.size(), -0.25);
+      LayerNorm<DataType>(N * 64, embedding_ffn_size_, embedding, buffer2,
+                          ip_emb_ffn_d2_b_, temp, ip_emb_ffn_ln_g_,
+                          ip_emb_ffn_ln_b_, 1e-3, alpha, ACTIVATION_NONE,
+                          sycl_queue);
+    }
+
+  } else {
+    // 1. square embedding (fully connected layer)
+    // Input data in NHWC layout N*(64)*C, output is N*(64)*embedding_op_size_
+    DataType* embedding = output_tensor;
+    {
+      const int num_outputs = embedding_op_size_;
+      const int num_inputs = inputC;
+      const int batch = N * 64;
+      cublasXgemm<DataType>(transpose_type_transpose, transpose_type_notranspose, num_outputs,
+                            batch, num_inputs, 1.0f, (const DataType*)ip_emb_w_,
+                            num_inputs, (DataType*)scratch, num_inputs, 0.0f,
+                            embedding, num_outputs, sycl_queue);
+      addBiasBatched(embedding, embedding, ip_emb_b_, 1, batch, num_outputs,
+                     activations_.default_activation, sycl_queue);
+    }
+    // Input gating
+    if (has_gating_) {
+      applyInputGating<DataType>(embedding, embedding, ip_mult_gate_,
+                                 ip_add_gate_, N, 64, embedding_op_size_,
+                                 sycl_queue);
+    }
+  }
+
+  // 2. Encoder blocks
+  for (const auto pEnc : encoder_weights_) {
+    pEnc->Eval(N, output_tensor, (DataType*)scratch, buffer1, buffer2, sycl_queue, offset_pointers);
+  }  // End of encoder blocks
+}
+
+// Builds the value head: uploads the dense embedding (attention body) or creates
+// the legacy Conv1Layer, then uploads FC1/FC2. max_batch_size is unused here.
+template <typename DataType>
+ValueHead<DataType>::ValueHead(
+    BaseLayer<DataType>* ip, const MultiHeadWeights::ValueHead& weights,
+    void* scratch, bool attention_body, bool wdl, ActivationFunction act,
+    int max_batch_size, sycl::queue& sycl_queue)
+    : BaseLayer<DataType>(weights.ip_val_b.size(), 8, 8, ip, sycl_queue),
+      embedding_size_(attention_body ? weights.ip_val_b.size()
+                                     : weights.value.biases.size()),
+      value_hidden_size_(weights.ip1_val_b.size()),
+      wdl_(wdl),  // initializers are listed in member declaration order
+      attention_body_(attention_body),
+      act_(act) {
+  if (attention_body_) {  // dense embedding weights, applied in Eval()
+    allocAndUpload<DataType>(&ip_val_w_, weights.ip_val_w, scratch, sycl_queue);
+    allocAndUpload<DataType>(&ip_val_b_, weights.ip_val_b, scratch, sycl_queue);
+  } else {  // legacy path: Conv1Layer created with bias enabled (the `true`)
+    conv_ = std::make_unique<Conv1Layer<DataType>>(
+        ip, weights.value.biases.size(), 8, 8, ip->GetC(), act, true, sycl_queue);
+    conv_->LoadWeights((float*)&weights.value.weights[0],
+                       (float*)&weights.value.biases[0], scratch);
+  }
+  // "FC1" weights and biases.
+  allocAndUpload<DataType>(&ip1_val_w_, weights.ip1_val_w, scratch, sycl_queue);
+  allocAndUpload<DataType>(&ip1_val_b_, weights.ip1_val_b, scratch, sycl_queue);
+  // "FC2" weights and biases.
+  allocAndUpload<DataType>(&ip2_val_w_, weights.ip2_val_w, scratch, sycl_queue);
+  allocAndUpload<DataType>(&ip2_val_b_, weights.ip2_val_b, scratch, sycl_queue);
+}
+
+// Releases the device allocations made by the constructor.
+template <typename DataType>
+ValueHead<DataType>::~ValueHead() {
+  // The embedding weights are only allocated for attention-body networks.
+  if (attention_body_) {
+    DataType* const embedding[] = {ip_val_w_, ip_val_b_};
+    for (DataType* ptr : embedding) sycl::free(ptr, sycl_queue_);
+  }
+  DataType* const dense[] = {ip1_val_w_, ip1_val_b_, ip2_val_w_, ip2_val_b_};
+  for (DataType* ptr : dense) sycl::free(ptr, sycl_queue_);
+}
+// Evaluates the value head for a batch of N: embedding stage, FC1, then FC2.
+template <typename DataType>
+void ValueHead<DataType>::Eval(int N, DataType* output, const DataType* input,
+                               const DataType* input2, void* scratch,
+                               size_t scratch_size, sycl::queue &sycl_queue,
+                               DataType***) {
+  DataType* buffer = (DataType*)input2;  // input2 is written to as an intermediate buffer
+  {  // Embedding stage: input -> buffer
+    const int num_inputs = this->input_->GetC();
+    const int num_outputs = embedding_size_;
+    const int batch = N * 64;
+    if (attention_body_) {
+      cublasXgemm<DataType>(transpose_type_transpose, transpose_type_notranspose, num_outputs,
+                            batch, num_inputs, 1.0f, (const DataType*)ip_val_w_,
+                            num_inputs, input, num_inputs, 0.0f, buffer,
+                            num_outputs, sycl_queue);
+      addBiasBatched<DataType>(buffer, buffer, ip_val_b_, 1, batch, num_outputs,
+                               act_, sycl_queue);  // bias + act_, in place on buffer
+
+    } else {  // non-attention body: use the conv_ layer built in the constructor
+      conv_->Eval(N, buffer, input, nullptr, scratch, scratch_size, sycl_queue);
+    }
+  }
+
+  {
+    // Value dense 1 ("FC1"): embedding_size_ * 64 inputs -> value_hidden_size_, written to scratch.
+    const int num_inputs = embedding_size_ * 64;
+    const int num_outputs = value_hidden_size_;
+    const int batch = N;
+    DataType* layer_out = (DataType*)scratch;
+    cublasXgemm<DataType>(transpose_type_transpose, transpose_type_notranspose, num_outputs, batch,
+                          num_inputs, 1.0f, (const DataType*)ip1_val_w_,
+                          num_inputs, buffer, num_inputs, 0.0f, layer_out,
+                          num_outputs, sycl_queue);
+    addBiasBatched<DataType>(layer_out, layer_out, ip1_val_b_, 1, batch,
+                             num_outputs, act_, sycl_queue);
+  }
+
+  {
+    // Value dense 2 ("FC2"): 3 outputs per sample with WDL, otherwise 1; written to output.
+    const int num_inputs = value_hidden_size_;
+    const int num_outputs = wdl_ ? 3 : 1;
+    const int batch = N;
+    DataType* layer_out = (DataType*)output;
+    cublasXgemm<DataType>(transpose_type_transpose, transpose_type_notranspose, num_outputs, batch,
+                          num_inputs, 1.0f, (const DataType*)ip2_val_w_,
+                          num_inputs, (DataType*)scratch, num_inputs, 0.0f,
+                          layer_out, num_outputs, sycl_queue);
+    addVectors(layer_out, layer_out, ip2_val_b_, num_outputs * batch,
+               num_outputs * batch, num_outputs,
+               wdl_ ? ACTIVATION_NONE : ACTIVATION_TANH, sycl_queue);  // tanh only without WDL
+  }
+}
+
+// Explicit instantiations of every layer template for sycl::half and float.
+template class FCLayer<sycl::half>;
+template class FCLayer<float>;
+
+template class SELayer<sycl::half>;
+template class SELayer<float>;
+
+template class PolicyMapLayer<sycl::half>;
+template class PolicyMapLayer<float>;
+
+template class FusedWinogradConvSELayer<sycl::half>;
+template class FusedWinogradConvSELayer<float>;
+
+template class Conv1Layer<sycl::half>;
+template class Conv1Layer<float>;
+
+template class ResidualBlock<sycl::half>;
+template class ResidualBlock<float>;
+
+template class AttentionPolicyHead<sycl::half>;
+template class AttentionPolicyHead<float>;
+
+template class EncoderBlock<sycl::half>;
+template class EncoderBlock<float>;
+
+template class AttentionBody<sycl::half>;
+template class AttentionBody<float>;
+
+template class EmbeddingLayer<sycl::half>;
+template class EmbeddingLayer<float>;
+
+template class ValueHead<sycl::half>;
+template class ValueHead<float>;
+
+#ifdef USE_CUBLAS
+// Misc error handling stuff.
+// Maps a numeric cuBLAS status code to its symbolic name.
+const char* CublasGetErrorString(int status) {
+  struct StatusName {
+    int code;
+    const char* name;
+  };
+  static const StatusName kStatusNames[] = {
+      {0, "CUBLAS_STATUS_SUCCESS"},
+      {1, "CUBLAS_STATUS_NOT_INITIALIZED"},
+      {3, "CUBLAS_STATUS_ALLOC_FAILED"},
+      {7, "CUBLAS_STATUS_INVALID_VALUE"},
+      {8, "CUBLAS_STATUS_ARCH_MISMATCH"},
+      {11, "CUBLAS_STATUS_MAPPING_ERROR"},
+      {13, "CUBLAS_STATUS_EXECUTION_FAILED"},
+      {14, "CUBLAS_STATUS_INTERNAL_ERROR"},
+      {15, "CUBLAS_STATUS_NOT_SUPPORTED"},
+      {16, "CUBLAS_STATUS_LICENSE_ERROR"},
+  };
+  // Linear scan: the table only has a handful of entries.
+  for (const StatusName& entry : kStatusNames) {
+    if (entry.code == status) return entry.name;
+  }
+  // Codes missing from the table get a generic description.
+  return "unknown cublas error";
+}
+
+// Throws an Exception naming a non-zero cuBLAS status and its source location.
+void CublasError(int status, const char* file, const int& line) {
+  if (status == 0) return;
+  char message[128];  // snprintf truncates long paths instead of overflowing
+  snprintf(message, sizeof(message), "CUBLAS error: %s (%s:%d) ",
+           CublasGetErrorString(status), file, line);
+  throw Exception(message);
+}
+#endif
+
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/layers.h b/src/neural/backends/sycl/layers.h
new file mode 100644
index 0000000000..850e29ecc0
--- /dev/null
+++ b/src/neural/backends/sycl/layers.h
@@ -0,0 +1,518 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier:GNU General Public License v3.0 or later
+*/
+
+#pragma once
+
+#include <sycl/sycl.hpp>
+
+#include <cstddef>
+
+#include "sycl_common.h"
+#include "neural/backends/shared/activation.h"
+#include "neural/network_legacy.h"
+
+namespace lczero {
+namespace sycldnn_backend {
+
+// The Layer objects only hold memory for weights, biases, etc.;
+// memory for input and output tensors is provided by the caller of Eval.
+
+template <typename DataType>
+class BaseLayer {  // abstract base class for layers (Eval is pure virtual)
+ public:
+  int GetC() const { return C; }
+  int GetH() const { return H; }
+  int GetW() const { return W; }
+  sycl::queue GetSycl_Queue() { return sycl_queue_;}  // returned by value
+
+  bool isNHWC() const { return nhwc_; }
+
+  BaseLayer(int c, int h, int w, BaseLayer* ip, sycl::queue& sycl_queue);
+  BaseLayer(int c, int h, int w, BaseLayer* ip, bool nhwc, sycl::queue& sycl_queue);
+  virtual ~BaseLayer() = default;
+  size_t GetOutputSize(int N) const { return sizeof(DataType) * N * C * H * W; }  // bytes for batch N
+
+  // Evaluates the layer for a batch of N and writes the result to output.
+  // Input2 is optional (skip connection).
+  // scratch/scratch_size describe caller-provided temporary memory; the unit
+  // of scratch_size is not visible in this header (presumably bytes - TODO
+  // confirm). The unnamed DataType*** parameter defaults to nullptr.
+  // Pure virtual: every concrete layer supplies its own implementation.
+  virtual void Eval(int N, DataType* output, const DataType* input,
+                    const DataType* input2, void* scratch, size_t scratch_size,
+                    sycl::queue &sycl_queue, DataType*** = nullptr) = 0;
+
+ protected:
+  BaseLayer* input_;  // presumably the layer passed as ip to the constructor - TODO confirm
+  sycl::queue& sycl_queue_;  // reference member: the referenced queue must outlive the layer
+
+  int C;  // Output tensor dimensions.
+  int H;
+  int W;
+
+  bool nhwc_;  // tensor layout
+
+  void cublasRowMajorMatrixMul(const DataType* A, const DataType* B,
+                               DataType* Out, int M, int N, int K,
+                               int batchSize, sycl::queue &sycl_queue);
+};
+
+template <typename DataType>
+class FCLayer : public BaseLayer<DataType> {  // layer with weights, optional bias and an activation
+  using BaseLayer<DataType>::nhwc_;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  FCLayer(BaseLayer<DataType>* ip, int C, int H, int W, bool bias,
+          ActivationFunction activation, sycl::queue &sycl_queue);
+  ~FCLayer();
+
+  void LoadWeights(float* cpuWeight, float* cpuBias, void* scratch);
+  // LoadWeights takes host-side float arrays plus a scratch buffer; presumably
+  // it fills weights_ and biases_ below - TODO confirm against the definition.
+  // Eval overrides BaseLayer::Eval and evaluates this layer for a batch of N;
+  // the trailing DataType*** argument is optional (defaults to nullptr).
+
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  const bool use_bias_;  // presumably set from the constructor's bias flag
+  const ActivationFunction act_;  // presumably set from the constructor's activation
+  DataType* weights_ = nullptr;  // null until allocated
+  DataType* biases_ = nullptr;
+};
+
+template <typename DataType>
+class PolicyMapLayer : public BaseLayer<DataType> {  // policy map layer driven by a table of shorts
+  using BaseLayer<DataType>::nhwc_;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  PolicyMapLayer(BaseLayer<DataType>* ip, int C, int H, int W, int usedSize,
+                 bool attention, sycl::queue &sycl_queue);
+  ~PolicyMapLayer();
+
+  void LoadWeights(const short* cpuWeight, void* scratch);  // host-side table of shorts
+
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size, sycl::queue &sycl_queue,
+            DataType*** = nullptr) override;
+
+ private:
+  int used_size_;  // Size of the input without padding (typically 73x64).
+                   // This is over-written to contain size with padding
+                   // (typically 80x64) after CHW->HWC conversion for fp16.
+  const bool attention_map_;  // presumably the constructor's attention flag - TODO confirm
+  short* weights_ = nullptr;  // presumably the table given to LoadWeights - TODO confirm
+};
+
+// Fused SE layer:
+// (optional bias add +) global avg -> FC1 -> FC2 -> global scale -> add skip
+// connection -> activation (described as RELU; the constructor takes act_).
+template <typename DataType>
+class SELayer : public BaseLayer<DataType> {
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::nhwc_;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  SELayer(BaseLayer<DataType>* ip, int numFc1Out, bool addPrevLayerBias,
+          ActivationFunction activation, sycl::queue &sycl_queue);
+  ~SELayer();
+
+  void LoadWeights(float* w1, float* b1, float* w2, float* b2,
+                   float* prevLayerBias, void* scratch);  // host-side float arrays
+
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  DataType* w1_ = nullptr;  // FC1 weights
+  DataType* w1_t_ = nullptr;  // transposed copy used by fused SE kernel
+  DataType* b1_ = nullptr;  // FC1 biases
+  DataType* w2_ = nullptr;  // FC2 weights
+  DataType* w2_t_ = nullptr;  // presumably the transposed copy of w2_ - TODO confirm
+  DataType* b2_ = nullptr;  // FC2 biases
+  DataType* bPrev_ = nullptr;  // presumably the previous layer's bias - TODO confirm
+  int numFc1Out_;  // number of FC1 outputs
+  bool addPrevLayerBias_;  // the "optional bias add" in the description above
+  const ActivationFunction act_;
+};
+
+// Multi-pass Winograd Conv fused with (optional) SE
+template <typename DataType>
+class FusedWinogradConvSELayer : public BaseLayer<DataType> {
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::H;
+  using BaseLayer<DataType>::W;
+  using BaseLayer<DataType>::GetC;
+  using BaseLayer<DataType>::GetH;
+  using BaseLayer<DataType>::GetW;
+  using BaseLayer<DataType>::nhwc_;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  FusedWinogradConvSELayer(BaseLayer<DataType>* ip, int C, int H, int W,
+                           int Cin, ActivationFunction activation, bool bias,
+                           bool skipAdd, bool se, int se_k,
+                           sycl::queue &sycl_queue, bool op_nhcw = false);
+
+  ~FusedWinogradConvSELayer();
+  void LoadWeights(float* pfilter, float* pBias, void* scratch);  // convolution filter and bias
+  void LoadSEWeights(float* w1, float* b1, float* w2, float* b2, void* scratch);  // optional SE part
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  const int c_input_;  // presumably the constructor's Cin - TODO confirm
+  const ActivationFunction act_;
+  const bool use_bias_;
+  const bool skip_add_;
+  const bool has_se_;
+  const int se_k_;
+  const bool op_nhcw_;
+
+  DataType* biases_ = nullptr;
+  DataType* transformed_weights_ = nullptr;  // After winograd transform.
+
+  // Weights and Biases for (optional) SE; null until allocated, like the pointers above.
+  DataType* w1_ = nullptr;
+  DataType* w2_ = nullptr;
+  DataType* b1_ = nullptr;
+  DataType* b2_ = nullptr;
+};
+
+template <typename DataType>
+class Conv1Layer : public BaseLayer<DataType> {  // convolution layer with optional bias and an activation
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::H;
+  using BaseLayer<DataType>::W;
+  using BaseLayer<DataType>::GetC;
+  using BaseLayer<DataType>::GetH;
+  using BaseLayer<DataType>::GetW;
+  using BaseLayer<DataType>::nhwc_;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  Conv1Layer(BaseLayer<DataType>* ip, int C, int H, int W, int Cin,
+             ActivationFunction activation, bool bias, sycl::queue &sycl_queue);
+
+  ~Conv1Layer();
+  void LoadWeights(float* pfilter, float* pBias, void* scratch);  // host-side filter and bias
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  const int c_input_;  // presumably the constructor's Cin - TODO confirm
+  const ActivationFunction act_;
+  const bool use_bias_;
+
+  DataType* biases_ = nullptr;  // null until allocated
+  DataType* weights_ = nullptr;
+
+  // Matrix multiply helper: uses stride of 0 to read a vector as a matrix.
+   void cublasSpecialMatrixMul(const DataType* A, const DataType* B,
+                              DataType* Out, int M, int N, int K, int batchSize, sycl::queue &sycl_queue);
+};
+
+// Residual block holding two sets of Winograd-transformed conv weights and (optional) SE.
+template <typename DataType>
+class ResidualBlock : public BaseLayer<DataType> {
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::H;
+  using BaseLayer<DataType>::W;
+  using BaseLayer<DataType>::GetC;
+  using BaseLayer<DataType>::GetH;
+  using BaseLayer<DataType>::GetW;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  ResidualBlock(BaseLayer<DataType>* ip, int C, bool se, int se_k,
+                bool first, bool last,
+                ActivationFunction activation, int shared_mem_size, sycl::queue &sycl_queue);
+
+  ~ResidualBlock();
+  void LoadWeights0(float* pfilter, float* pBias, void* scratch);  // first filter/bias set
+  void LoadWeights1(float* pfilter, float* pBias, void* scratch);  // second filter/bias set
+  void LoadSEWeights(float* w1, float* b1, float* w2, float* b2, void* scratch);  // optional SE part
+
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  const bool has_se_;
+  const int se_k_;
+  const int c_input_;
+  const bool first_block_;
+  const bool last_block_;
+  const int shared_mem_size_;
+  const ActivationFunction act_;
+
+  DataType* biases0_ = nullptr;
+  DataType* biases1_ = nullptr;
+  DataType* transformed_weights0_ = nullptr;  // After winograd transform.
+  DataType* transformed_weights1_ = nullptr;  // After winograd transform.
+
+  // Weights and Biases for (optional) SE; null until allocated, like the pointers above.
+  DataType* w1_ = nullptr;
+  DataType* w2_ = nullptr;
+  DataType* b1_ = nullptr;
+  DataType* b2_ = nullptr;
+};
+
+template <typename DataType>
+class EncoderBlock {  // weights and sizes of one encoder layer; not derived from BaseLayer
+ public:
+  EncoderBlock(const MultiHeadWeights::EncoderLayer& cpu_weights, void* scratch,
+               int heads, int size, float alpha,
+               DataType* smolgen_global_scratch, int smolgen_global_size,
+               int max_batch_size, ActivationFunction smolgen_act,
+               ActivationFunction ffn_act, float default_eps, sycl::queue &sycl_queue);
+  ~EncoderBlock();
+
+  void Eval(int N, DataType* inpop, DataType* scratch0, DataType* scratch1,
+            DataType* scratch2, sycl::queue &sycl_queue,
+            DataType*** offset_pointers);  // inpop is presumably updated in place - TODO confirm
+
+  // all GPU side pointers
+  DataType *mha_q_w, *mha_q_b;
+  DataType *mha_k_w, *mha_k_b;
+  DataType *mha_v_w, *mha_v_b;
+  DataType *mha_qkv_w, *mha_qkv_b;  // presumably the q/k/v weights in one allocation - TODO confirm
+  DataType *mha_dense_w, *mha_dense_b;
+
+  DataType *ln1_gammas, *ln1_betas;
+
+  DataType *ffn_dense1_w, *ffn_dense1_b;
+  DataType *ffn_dense2_w, *ffn_dense2_b;
+
+  DataType *ln2_gammas, *ln2_betas;
+
+  DataType *smol_compress;
+  DataType *smol_dense1_w, *smol_dense1_b;
+  DataType *smol_dense2_w, *smol_dense2_b;
+  DataType *smol_ln1_gammas, *smol_ln1_betas;
+  DataType *smol_ln2_gammas, *smol_ln2_betas;
+  DataType *smol_global;
+
+  int mha_q_size_;
+  int mha_k_size_;
+  int mha_v_size_;
+  int mha_dense_size_;
+
+  int ffn_dense1_size_;
+  int ffn_dense2_size_;
+
+  int embedding_op_size_;
+  int encoder_heads_;
+
+  float alpha_;  // scale to apply to skip connection add
+  float default_eps_;  // value of epsilon where it wasn't specified in training
+
+  const bool has_smolgen_;  // presumably gates use of the smol_* members - TODO confirm
+  const ActivationFunction smolgen_activation_;
+  const ActivationFunction ffn_activation_;
+
+  // Output sizes for smolgen layers.
+  int smol_compress_size_;
+  int smol_dense_1_size_;
+  int smol_dense_2_size_;
+  int smol_global_size_;
+
+  const int max_batch_size_;
+
+  sycl::queue sycl_queue_;  // held by value here (BaseLayer holds a reference instead)
+};
+
+// The Attention policy head implementation
+// Responsible for loading weights into GPU memory, and evaluating the entire
+// policy head
+template <typename DataType>
+class AttentionPolicyHead : public BaseLayer<DataType> {
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::H;
+  using BaseLayer<DataType>::W;
+  using BaseLayer<DataType>::GetC;
+  using BaseLayer<DataType>::GetH;
+  using BaseLayer<DataType>::GetW;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  AttentionPolicyHead(BaseLayer<DataType>* ip,
+                      const MultiHeadWeights::PolicyHead& weights,
+                      void* scratch, bool attention_body,
+                      ActivationFunction act, int max_batch_size, sycl::queue &sycl_queue);
+
+  ~AttentionPolicyHead();
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  // GPU allocations to hold various weights used by the attention policy head
+  DataType *ip_pol_w_, *ip_pol_b_;    // "embedding" in policy attention
+  DataType *ip2_pol_w_, *ip2_pol_b_;  // "wq" in policy attention
+  DataType *ip3_pol_w_, *ip3_pol_b_;  // "wk" in policy attention
+  DataType *ip4_pol_w_;               // "ppo" in policy attention
+
+  DataType *wqk_w_, *wqk_b_;  // allocation containing both "wq" and "wk"
+
+  int embedding_op_size_;
+  int wq_op_size_;
+  int wk_op_size_;
+
+  int encoder_heads_;
+  int policy_d_model_;
+  bool attention_body_;  // presumably the constructor's attention_body flag
+  ActivationFunction act_;
+
+  std::vector<EncoderBlock<DataType>*> encoder_weights_;  // raw pointers to the encoder layers of this head
+};
+
+template <typename DataType>
+class EmbeddingLayer : public BaseLayer<DataType> {  // layer built from host weight and bias vectors
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::H;
+  using BaseLayer<DataType>::W;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  EmbeddingLayer(BaseLayer<DataType>* ip, const std::vector<float>& weights,
+                 const std::vector<float>& biases, void* scratch,
+                 ActivationFunction activation, sycl::queue &sycl_queue);
+  ~EmbeddingLayer();
+
+  // Evaluates the layer for a batch of N (overrides BaseLayer::Eval).
+  // The implementation is out of line; presumably it applies weights_ and
+  // biases_ followed by act_ - TODO confirm against the definition.
+  // The trailing DataType*** argument is optional (defaults to nullptr).
+
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  DataType *weights_, *biases_;  // no default initializer here
+  ActivationFunction act_;
+};
+
+// The Attention body implementation
+// Responsible for loading weights into GPU memory, and evaluating the entire
+// attention network part of the body including the stack of encoder layers
+template <typename DataType>
+class AttentionBody : public BaseLayer<DataType> {
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::H;
+  using BaseLayer<DataType>::W;
+  using BaseLayer<DataType>::GetC;
+  using BaseLayer<DataType>::GetH;
+  using BaseLayer<DataType>::GetW;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  AttentionBody(const MultiHeadWeights& weights, void* scratch,
+                Activations activations, int num_res_blocks, int input_c,
+                int max_batch_size, bool is_pe_dense_embedding,
+                sycl::queue &sycl_queue);
+  ~AttentionBody();
+  // Eval overrides BaseLayer::Eval. Its out-of-line definition runs the input
+  // embedding and then evaluates every EncoderBlock in encoder_weights_ in
+  // order.
+  // The trailing DataType*** argument is optional (defaults to nullptr).
+
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  // GPU allocations to hold various weights used by the attention net body.
+  DataType *ip_emb_pre_w_, *ip_emb_pre_b_;  // input position preprocessing weights.
+  DataType *ip_emb_w_, *ip_emb_b_;          // "embedding" layer in net body
+  DataType *ip_emb_ln_g_, *ip_emb_ln_b_;  // input embedding layernorm gamma and beta
+  DataType *ip_mult_gate_, *ip_add_gate_;   // input gating
+  DataType *ip_emb_ffn_d1_w_, *ip_emb_ffn_d1_b_;  // input embedding FFN dense1 weights
+  DataType *ip_emb_ffn_d2_w_, *ip_emb_ffn_d2_b_;  // input embedding FFN dense2 weights
+  DataType *ip_emb_ffn_ln_g_, *ip_emb_ffn_ln_b_;  // input embedding FFN layernorm gamma and beta
+  DataType *smolgen_global_;  // global smolgen weights for all encoder layers
+  bool is_pe_dense_embedding_;  // flag for dense position encoding
+  DataType *pos_encoding_;
+  int embedding_dense_size_;
+  int embedding_op_size_;
+  int embedding_ffn_size_;
+  int embedding_ffn_dff_;
+  int encoder_head_count_;
+  std::vector<EncoderBlock<DataType>*> encoder_weights_;  // encoder layers, evaluated in order by Eval
+  Activations activations_;
+  int num_resi_blocks_;
+  int input_c_;
+  int smolgen_global_size_;
+  const bool has_gating_;  // enables the input gating step in Eval
+  const bool has_smolgen_;
+};
+
+// The value head implementation
+// Responsible for loading weights into GPU memory, and evaluating the value
+// head and value error head
+template <typename DataType>
+class ValueHead : public BaseLayer<DataType> {
+  using BaseLayer<DataType>::C;
+  using BaseLayer<DataType>::H;
+  using BaseLayer<DataType>::W;
+  using BaseLayer<DataType>::GetC;
+  using BaseLayer<DataType>::GetH;
+  using BaseLayer<DataType>::GetW;
+  using BaseLayer<DataType>::sycl_queue_;
+
+ public:
+  ValueHead(BaseLayer<DataType>* ip, const MultiHeadWeights::ValueHead& weights,
+            void* scratch, bool attention_body, bool wdl, ActivationFunction act,
+            int max_batch_size, sycl::queue &sycl_queue);
+  ~ValueHead();
+  void Eval(int N, DataType* output, const DataType* input,
+            const DataType* input2, void* scratch, size_t scratch_size,
+            sycl::queue &sycl_queue, DataType*** = nullptr) override;
+
+ private:
+  // "convolution" in value head (legacy)
+  std::unique_ptr<Conv1Layer<DataType>> conv_;
+
+  // GPU allocations to hold various weights used by the value head
+  DataType *ip_val_w_, *ip_val_b_;          // "embedding" in value head
+  DataType *ip1_val_w_, *ip1_val_b_;        // "FC1" in value head
+  DataType *ip2_val_w_, *ip2_val_b_;        // "FC2" in value head
+  DataType *ip_val_err_w_, *ip_val_err_b_;  // value error "FC" weights
+
+  int embedding_size_;
+  int value_hidden_size_;
+  bool wdl_;  // true: FC2 produces 3 outputs per sample, false: 1
+  bool attention_body_;  // true: dense embedding weights are used, false: conv_
+  ActivationFunction act_;
+};
+
+
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/network_sycl.cc.dp.cpp b/src/neural/backends/sycl/network_sycl.cc.dp.cpp
new file mode 100644
index 0000000000..11683c8aae
--- /dev/null
+++ b/src/neural/backends/sycl/network_sycl.cc.dp.cpp
@@ -0,0 +1,1161 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier:GNU General Public License v3.0 or later
+*/
+
+#define DPCT_COMPAT_RT_VERSION 12020
+
+#include <sycl/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <list>
+#include <memory>
+#include <mutex>
+
+#include "sycl_common.h"
+#include "inputs_outputs.h"
+#include "kernels.h"
+#include "layers.h"
+#include "neural/backends/shared/activation.h"
+#include "neural/factory.h"
+#include "neural/network_legacy.h"
+#include "neural/tables/attention_policy_map.h"
+#include "neural/tables/policy_map.h"
+#include "utils/bititer.h"
+#include "utils/exception.h"
+#include <cmath>
+
+namespace lczero {
+using namespace sycldnn_backend;
+
+template <typename DataType>
+class SyclNetwork;
+
+// Computes an upper bound (in elements, no sizeof factor is applied here) on the
+// intermediate tensors of the attention policy head for a batch of N.
+static size_t getMaxAttentionHeadSize(
+    const MultiHeadWeights::PolicyHead& weights, int N) {
+  const size_t embedding_op_size = weights.ip_pol_b.size();
+  const size_t policy_d_model = weights.ip2_pol_b.size();
+  assert(policy_d_model == weights.ip3_pol_b.size());
+
+  // Encoder dimensions stay zero when the head has no encoder layers.
+  const bool has_encoder = weights.pol_encoder.size() > 0;
+  const size_t encoder_d_model =
+      has_encoder ? weights.pol_encoder[0].mha.q_b.size() : 0;
+  const size_t encoder_dff =
+      has_encoder ? weights.pol_encoder[0].ffn.dense1_b.size() : 0;
+  if (has_encoder) {
+    assert(encoder_d_model == weights.pol_encoder[0].mha.k_b.size());
+    assert(encoder_d_model == weights.pol_encoder[0].mha.v_b.size());
+    assert(embedding_op_size == weights.pol_encoder[0].ffn.dense2_b.size());
+  }
+
+  // Candidates: the widest per-square activation over N * 64 squares, the
+  // matmul_qk matrix (encoder_heads * N * 64 * 64) and the output tensor.
+  const size_t encoder_heads = weights.pol_encoder_head_count;
+  const size_t squares = N * 64;
+  const size_t activation_size =
+      squares * std::max({embedding_op_size, encoder_dff, policy_d_model});
+  const size_t matmul_qk_size = encoder_heads * squares * 64;
+  const size_t output_size = N * (64 * 64 + 8 * 24);
+  const size_t largest = std::max({activation_size, matmul_qk_size, output_size});
+
+  // We store qkv in single allocation, and other intermediate tensors are
+  // sometimes stored by splitting an allocation into two halves.
+  const size_t qkv_size = squares * encoder_d_model;
+  return std::max(2 * largest, 3 * qkv_size);
+}
+
+// Computes an upper bound (in elements, no sizeof factor is applied here) on the
+// intermediate tensors of the attention body for a batch of N.
+static size_t getMaxAttentionBodySize(const MultiHeadWeights& weights, int N) {
+  const size_t embedding_op_size = weights.ip_emb_b.size();
+
+  // Encoder dimensions stay zero when the body has no encoder layers.
+  const bool has_encoder = weights.encoder.size() > 0;
+  const size_t encoder_d_model =
+      has_encoder ? weights.encoder[0].mha.q_b.size() : 0;
+  const size_t encoder_dff =
+      has_encoder ? weights.encoder[0].ffn.dense1_b.size() : 0;
+  if (has_encoder) {
+    assert(encoder_d_model == weights.encoder[0].mha.k_b.size());
+    assert(encoder_d_model == weights.encoder[0].mha.v_b.size());
+    assert(embedding_op_size == weights.encoder[0].ffn.dense2_b.size());
+  }
+
+  // Candidates: the widest per-square activation over N * 64 squares, the
+  // matmul_qk matrix (encoder_heads * N * 64 * 64) and the output tensor.
+  const size_t encoder_heads = weights.encoder_head_count;
+  const size_t squares = N * 64;
+  const size_t activation_size =
+      squares * std::max({embedding_op_size, encoder_dff, encoder_d_model});
+  const size_t matmul_qk_size = encoder_heads * squares * 64;
+  const size_t output_size = N * (64 * 64 + 8 * 24);
+  const size_t largest = std::max({activation_size, matmul_qk_size, output_size});
+
+  // We store qkv in single allocation, and other intermediate tensors are
+  // sometimes stored by splitting an allocation into two halves.
+  const size_t qkv_size = squares * encoder_d_model;
+  return std::max(2 * largest, 3 * qkv_size);
+}
+
+// NetworkComputation implementation for the SYCL backend: collects the input
+// planes of one batch and exposes the network outputs stored in inputs_outputs_.
+template <typename DataType>
+class SyclNetworkComputation : public NetworkComputation {
+ public:
+  SyclNetworkComputation(SyclNetwork<DataType>* network, bool wdl,
+                         bool moves_left);
+  ~SyclNetworkComputation();
+
+  // Appends one position: copies the mask and value of every plane into the
+  // shared input buffers at the slot of the current batch index.
+  void AddInput(InputPlanes&& input) override {
+    const auto base = batch_size_ * kInputPlanes;
+    auto* mask_out = &inputs_outputs_->input_masks_mem_shared_[base];
+    auto* value_out = &inputs_outputs_->input_val_mem_shared_[base];
+    for (const auto& plane : input) {
+      *mask_out++ = plane.mask;
+      *value_out++ = plane.value;
+    }
+    ++batch_size_;
+  }
+
+  // Defined out of line.
+  void ComputeBlocking() override;
+
+  // Number of positions added so far.
+  int GetBatchSize() const override { return batch_size_; }
+
+  // Q value of a sample: w - l for WDL networks, the single value otherwise.
+  float GetQVal(int sample) const override {
+    const auto& values = inputs_outputs_->op_value_mem_shared_;
+    if (!wdl_) return values[sample];
+    // WDL output stores three entries per sample; index 0 is w, index 2 is l.
+    return values[3 * sample + 0] - values[3 * sample + 2];
+  }
+
+  // D value of a sample (entry 1 of the WDL triple), or 0 without WDL.
+  float GetDVal(int sample) const override {
+    if (!wdl_) return 0.0f;
+    return inputs_outputs_->op_value_mem_shared_[3 * sample + 1];
+  }
+
+  // Policy output for the given move index of a sample.
+  float GetPVal(int sample, int move_id) const override {
+    const auto index = sample * kNumOutputPolicy + move_id;
+    return inputs_outputs_->op_policy_mem_[index];
+  }
+
+  // Moves-left output of a sample, or 0 when moves_left_ is false.
+  float GetMVal(int sample) const override {
+    if (!moves_left_) return 0.0f;
+    return inputs_outputs_->op_moves_left_mem_shared_[sample];
+  }
+
+ private:
+  // Memory holding the inputs and outputs of this computation.
+  std::unique_ptr<InputsOutputs> inputs_outputs_;
+  int batch_size_;   // incremented by AddInput
+  bool wdl_;         // value output is read as three entries per sample
+  bool moves_left_;  // GetMVal reads the moves-left output only when set
+
+  // Raw pointer to the network passed to the constructor.
+  SyclNetwork<DataType>* network_;
+};
+
+template <typename DataType>
+class SyclNetwork : public Network {
+ public:
+  // Builds the network described by `file` on a SYCL device.
+  //
+  // Backend options read here (with defaults): "gpu" (0, index into the
+  // concatenated device list of all platforms), "max_batch" (1024),
+  // "cache_opt" (false), "multi_stream" (false), "policy_head" ("vanilla"),
+  // "value_head" ("winner") and "mlh" (true).
+  //
+  // Throws Exception when no SYCL platform exists, the device index is out of
+  // range, fp16 is requested on a device without fp16 support, the requested
+  // policy/value head is missing from the net, or a device allocation fails.
+  SyclNetwork(const WeightsFile& file, const OptionsDict& options)
+      : capabilities_{file.format().network_format().input(),
+                      file.format().network_format().output(),
+                      file.format().network_format().moves_left()} {
+    MultiHeadWeights weights(file.weights());
+    gpu_id_ = options.GetOrDefault<int>("gpu", 0);
+
+    const auto nf = file.format().network_format();
+    using NF = pblczero::NetworkFormat;
+    conv_policy_ = nf.policy() == NF::POLICY_CONVOLUTION;
+    attn_policy_ = nf.policy() == NF::POLICY_ATTENTION;
+    attn_body_ = nf.network() == NF::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT ||
+                 nf.network() == NF::NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT;
+
+    max_batch_size_ = options.GetOrDefault<int>("max_batch", 1024);
+
+    // Get all available platforms.
+    auto platforms = sycl::platform::get_platforms();
+
+    if (platforms.empty()) {
+      throw Exception("No SYCL platform found.");
+    }
+    showPlatformInfo(platforms);
+
+    // Flatten the devices of every platform into one list; "gpu" indexes it.
+    std::vector<sycl::device> devices;
+
+    for (const auto& platform : platforms) {
+      auto platform_devices = platform.get_devices();
+      devices.insert(devices.end(), platform_devices.begin(),
+                     platform_devices.end());
+    }
+
+    if (gpu_id_ >= (int)devices.size() || gpu_id_ < 0)
+      throw Exception("Invalid GPU Id: " + std::to_string(gpu_id_));
+
+    // Is it a cpu device?
+    is_cpu_ = devices[gpu_id_].is_cpu();
+    // Get the number of compute units (execution units).
+    compute_units_ =
+        devices[gpu_id_].get_info<sycl::info::device::max_compute_units>();
+    // Get context.
+    sycl::context context{devices[gpu_id_]};
+    // Asynchronous errors reported by the queue are logged and are fatal.
+    auto exceptions_handler = [&](sycl::exception_list exceptions) {
+      for (std::exception_ptr const& eptr : exceptions) {
+        try {
+          std::rethrow_exception(eptr);
+        } catch (sycl::exception const& e) {
+          CERR << "Caught asynchronous SYCL exception during GEMM:\n"
+               << e.what() << "\n ";
+          std::terminate();
+        }
+      }
+    };
+
+    // In-order queue: commands execute in submission order.
+    // The queue is heap-allocated and owned through the raw sycl_queue_
+    // pointer.
+    sycl_queue_ = new sycl::queue{
+        context, devices[gpu_id_], exceptions_handler,
+        sycl::property_list{sycl::property::queue::in_order{}}};
+
+    showDeviceInfo(*sycl_queue_);
+
+    // Use the global memory cache size here; local_mem_size is the size of
+    // work-group local memory, not of a cache.
+    l2_cache_size_ = static_cast<int>(
+        sycl_queue_->get_device()
+            .get_info<sycl::info::device::global_mem_cache_size>());
+
+    allow_cache_opt_ = options.GetOrDefault<bool>("cache_opt", false);
+
+    multi_stream_ = options.GetOrDefault<bool>("multi_stream", false);
+
+    // layout used by cuda backend is nchw.
+    has_tensor_cores_ = false;
+    constexpr bool fp16 = std::is_same<sycl::half, DataType>::value;
+
+    //dpct::device_info deviceProp = {};
+    //sycl_queue_->get_device().get_device_info(deviceProp);
+
+    if (fp16) {
+      if (!sycl_queue_->get_device().has(sycl::aspect::fp16)) {
+        throw Exception("Requested fp16 is not supported by the device");
+      }
+      CERR << "Using Fp16 ";
+    } else {
+      CERR << "Using Fp32 ";
+    }
+
+    const int kNumInputPlanes = kInputPlanes;
+    const int kNumFilters = (int)weights.input.biases.size();
+    numBlocks_ = (int)weights.residual.size();
+    numFilters_ = kNumFilters;
+
+    num_encoder_blocks_ = (int)weights.encoder.size();
+    if (attn_body_) {
+      assert(weights.ip_emb_b.size() > 0);
+    }
+
+    // Warn if the memory required for storing transformed weights is going to
+    // exceed 40% of total device memory.
+    size_t residual_single_layer_weight_size =
+        3 * 3 * kNumFilters * kNumFilters * sizeof(DataType);
+    size_t residual_weight_size =
+        residual_single_layer_weight_size * numBlocks_ * 2;
+    size_t transformed_residual_weight_size = residual_weight_size * 4;
+
+    // Total device memory. max_mem_alloc_size is only the largest single
+    // allocation and would make this warning fire spuriously.
+    size_t global_mem_size =
+        sycl_queue_->get_device()
+            .get_info<sycl::info::device::global_mem_size>();
+
+    if (transformed_residual_weight_size > 0.4 * global_mem_size) {
+      CERR << "WARNING: Low GPU video memory. You may run into OOM errors. Try "
+              "using a smaller network.";
+    }
+
+    // Residual block fusing is disabled for every data type and filter count
+    // until it is fixed in this backend.
+    // TODO: fix res_block_fusing.
+    // TODO: make it work for filters not a multiple of 32.
+    // Note that when used with SE, the optimization
+    // works only when filter count is <= 384 (pre-Ampere), or less than 512
+    // (Ampere)
+    // It turns dynamically off based on filter count (see
+    // ResidualBlock<DataType>::Eval)
+    use_res_block_winograd_fuse_opt_ = false;
+    // Override if set in backend-opts.
+#if  0
+    if (options.Exists<bool>("res_block_fusing")) {
+      use_res_block_winograd_fuse_opt_ = options.Get<bool>("res_block_fusing");
+    }
+#endif
+    /*
+    DPCT1005:86: The SYCL device version is different from CUDA Compute
+    Compatibility. You may need to rewrite this code.
+    */
+
+    // 0. Check for SE.
+    has_se_ = false;
+    if (numBlocks_ && weights.residual[0].has_se) {
+      has_se_ = true;
+    }
+
+    // Have some minimum as we also use this for transforming weights.
+    size_t max_weight_size = 128 * 1024 * 1024;
+
+    // parts from scratch allocation are suballocated to hold various weights
+    // and biases when transforming winograd weights (one layer at a time), 128
+    // MB is way more than that what we need but make sure it's at least 3x of
+    // single layer's weight size to be safe.
+    if (max_weight_size < 3 * residual_single_layer_weight_size)
+      max_weight_size = 3 * residual_single_layer_weight_size;
+
+    scratch_size_ = max_weight_size;
+
+    // Need additional space for the transformed tensor: 36/16 times the
+    // untransformed size (4x4 block transformed into 6x6).
+    if (numBlocks_ > 0) {
+      const size_t transformed_tensor_size =
+          (size_t)(max_batch_size_ * kNumFilters * 64 * (36.0 / 16.0) *
+                   sizeof(DataType));
+      scratch_size_ = std::max(scratch_size_, 2 * transformed_tensor_size);
+    }
+
+    std::string policy_head =
+        options.GetOrDefault<std::string>("policy_head", "vanilla");
+    // Check that selected policy head exists.
+    if (weights.policy_heads.count(policy_head) == 0) {
+      throw Exception("The policy head you specified '" + policy_head +
+                      "' does not exist in this net.");
+    }
+    std::string value_head =
+        options.GetOrDefault<std::string>("value_head", "winner");
+    // Check that selected value head exists.
+    if (weights.value_heads.count(value_head) == 0) {
+      throw Exception("The value head you specified '" + value_head +
+                      "' does not exist in this net.");
+    }
+
+    // Attention policy head or body may need more memory
+    const size_t attentionPolicySize =
+        getMaxAttentionHeadSize(weights.policy_heads.at(policy_head),
+                                max_batch_size_) *
+        sizeof(DataType);
+
+    const size_t attentionBodySize =
+        getMaxAttentionBodySize(weights, max_batch_size_) * sizeof(DataType);
+    scratch_size_ = std::max(scratch_size_,
+                             std::max(attentionPolicySize, attentionBodySize));
+
+    scratch_mem_ = (void*)sycl::malloc_device(scratch_size_, *sycl_queue_);
+    // malloc_device reports failure by returning nullptr; fail loudly here
+    // instead of handing a null scratch buffer to the layers below.
+    if (scratch_mem_ == nullptr) {
+      throw Exception("Failed to allocate " + std::to_string(scratch_size_) +
+                      " bytes of device scratch memory.");
+    }
+
+    const bool mish_net = file.format().network_format().default_activation() ==
+                          pblczero::NetworkFormat::DEFAULT_ACTIVATION_MISH;
+
+    ActivationFunction act = mish_net ? ACTIVATION_MISH : ACTIVATION_RELU;
+
+    // 2. Build the network, and copy the weights to GPU memory.
+    //    The order in which layers are appended to network_ must match the
+    //    order in which forwardEval() walks them.
+
+    // Input conv only used if there are residual blocks in the network
+    if (numBlocks_ > 0) {
+      // Input.
+      {
+        auto inputConv = std::make_unique<FusedWinogradConvSELayer<DataType>>(
+            nullptr, kNumFilters, 8, 8, kNumInputPlanes, act, true, false,
+            false, 0,  *sycl_queue_, use_res_block_winograd_fuse_opt_);
+
+        inputConv->LoadWeights(&weights.input.weights[0],
+                               &weights.input.biases[0], scratch_mem_);
+        network_.emplace_back(std::move(inputConv));
+      }
+
+      // Residual block.
+      for (int block = 0; block < numBlocks_; block++) {
+        bool has_se = weights.residual[block].has_se;
+        int se_k = (int)weights.residual[block].se.b1.size();
+
+        /*   
+        if (use_res_block_winograd_fuse_opt_) {
+          auto layer = std::make_unique<ResidualBlock<DataType>>(
+              getLastLayer(), kNumFilters, has_se, se_k,
+              block == 0, block == (numBlocks_ - 1), act,
+              deviceProp.sharedMemPerBlockOptin);
+          layer->LoadWeights0(&weights.residual[block].conv1.weights[0],
+                              &weights.residual[block].conv1.biases[0],
+                              scratch_mem_);
+          layer->LoadWeights1(&weights.residual[block].conv2.weights[0],
+                              &weights.residual[block].conv2.biases[0],
+                              scratch_mem_);
+          if (has_se)
+            layer->LoadSEWeights(&weights.residual[block].se.w1[0],
+                                 &weights.residual[block].se.b1[0],
+                                 &weights.residual[block].se.w2[0],
+                                 &weights.residual[block].se.b2[0],
+                                 scratch_mem_);
+          network_.emplace_back(std::move(layer));
+        } else { */
+          auto conv1 = std::make_unique<FusedWinogradConvSELayer<DataType>>(
+              getLastLayer(), kNumFilters, 8, 8, kNumFilters, act, true, false,
+              false, 0, *sycl_queue_);
+
+          conv1->LoadWeights(&weights.residual[block].conv1.weights[0],
+                             &weights.residual[block].conv1.biases[0],
+                             scratch_mem_);
+          network_.emplace_back(std::move(conv1));
+
+          auto conv2 = std::make_unique<FusedWinogradConvSELayer<DataType>>(
+              getLastLayer(), kNumFilters, 8, 8, kNumFilters, act, true, true,
+              has_se, se_k, *sycl_queue_);
+          conv2->LoadWeights(&weights.residual[block].conv2.weights[0],
+                             &weights.residual[block].conv2.biases[0],
+                             scratch_mem_);
+          if (has_se)
+            conv2->LoadSEWeights(&weights.residual[block].se.w1[0],
+                                 &weights.residual[block].se.b1[0],
+                                 &weights.residual[block].se.w2[0],
+                                 &weights.residual[block].se.b2[0],
+                                 scratch_mem_);
+          network_.emplace_back(std::move(conv2));
+        //}
+      }
+      resi_last_ = getLastLayer();
+    }
+
+    if (attn_body_) {
+      // Per-component activations fall back to the net's default activation
+      // when the weights file says ACTIVATION_DEFAULT.
+      Activations activations;
+      const auto smolgen_activation =
+          file.format().network_format().smolgen_activation();
+      activations.smolgen_activation =
+          smolgen_activation == pblczero::NetworkFormat::ACTIVATION_DEFAULT
+              ? act
+              : static_cast<ActivationFunction>(smolgen_activation);
+      const auto ffn_activation =
+          file.format().network_format().ffn_activation();
+      activations.ffn_activation =
+          ffn_activation == pblczero::NetworkFormat::ACTIVATION_DEFAULT
+              ? act
+              : static_cast<ActivationFunction>(ffn_activation);
+      activations.default_activation = act;
+
+      auto attention_body = std::make_unique<AttentionBody<DataType>>(
+          weights, scratch_mem_, activations, numBlocks_,
+          numBlocks_ > 0 ? kNumFilters : kInputPlanes, max_batch_size_,
+          static_cast<InputEmbedding>(
+              file.format().network_format().input_embedding()) ==
+              InputEmbedding::INPUT_EMBEDDING_PE_DENSE,
+          *sycl_queue_);
+      network_.emplace_back(std::move(attention_body));
+
+      encoder_last_ = getLastLayer();
+    }
+
+    // Policy head.
+    {
+      MultiHeadWeights::PolicyHead& head = weights.policy_heads.at(policy_head);
+      if (attn_policy_) {
+        auto AttentionPolicy = std::make_unique<AttentionPolicyHead<DataType>>(
+            getLastLayer(), head, scratch_mem_, attn_body_, act,
+            max_batch_size_, *sycl_queue_);
+        network_.emplace_back(std::move(AttentionPolicy));
+
+        auto policymap = std::make_unique<PolicyMapLayer<DataType>>(
+            getLastLayer(), kNumOutputPolicy, 1, 1, 64 * 64 + 8 * 24, true,
+            *sycl_queue_);
+        policymap->LoadWeights(kAttnPolicyMap, scratch_mem_);
+        network_.emplace_back(std::move(policymap));
+
+      } else {
+        if (conv_policy_) {
+          assert(!attn_body_);  // not supported with attention body
+          auto conv1 = std::make_unique<FusedWinogradConvSELayer<DataType>>(
+              resi_last_, kNumFilters, 8, 8, kNumFilters, act, true, false,
+              false, 0, *sycl_queue_);
+          conv1->LoadWeights(&head.policy1.weights[0], &head.policy1.biases[0],
+                             scratch_mem_);
+          network_.emplace_back(std::move(conv1));
+
+          auto pol_channels = head.policy.biases.size();
+
+          // No relu
+          auto conv2 = std::make_unique<FusedWinogradConvSELayer<DataType>>(
+              getLastLayer(), pol_channels, 8, 8, kNumFilters, ACTIVATION_NONE,
+              true, false, false, 0, *sycl_queue_);
+          conv2->LoadWeights(&head.policy.weights[0], &head.policy.biases[0],
+                             scratch_mem_);
+          network_.emplace_back(std::move(conv2));
+
+          auto policymap = std::make_unique<PolicyMapLayer<DataType>>(
+              getLastLayer(), kNumOutputPolicy, 1, 1, 73 * 8 * 8, false,
+              *sycl_queue_);
+          policymap->LoadWeights(kConvPolicyMap, scratch_mem_);
+
+          network_.emplace_back(std::move(policymap));
+        } else {
+          assert(!attn_body_);  // not supported with attention body
+          auto convPol = std::make_unique<Conv1Layer<DataType>>(
+              resi_last_, head.policy.biases.size(), 8, 8, kNumFilters, act,
+              true, *sycl_queue_);
+          convPol->LoadWeights(&head.policy.weights[0], &head.policy.biases[0],
+                               scratch_mem_);
+          network_.emplace_back(std::move(convPol));
+
+          auto FCPol = std::make_unique<FCLayer<DataType>>(
+              getLastLayer(), head.ip_pol_b.size(), 1, 1, true,
+              ACTIVATION_NONE, *sycl_queue_);
+          FCPol->LoadWeights(&head.ip_pol_w[0], &head.ip_pol_b[0],
+                             scratch_mem_);
+          network_.emplace_back(std::move(FCPol));
+        }
+      }
+    }
+
+    // Value heads.
+    {
+      const MultiHeadWeights::ValueHead& head =
+          weights.value_heads.at(value_head);
+      wdl_ = file.format().network_format().value() ==
+             pblczero::NetworkFormat::VALUE_WDL;
+
+      BaseLayer<DataType>* lastlayer = attn_body_ ? encoder_last_ : resi_last_;
+      auto value_main = std::make_unique<ValueHead<DataType>>(
+          lastlayer, head, scratch_mem_, attn_body_, wdl_, act,
+          max_batch_size_, *sycl_queue_);
+      network_.emplace_back(std::move(value_main));
+    }
+
+    // Moves left head
+    moves_left_ = (file.format().network_format().moves_left() ==
+                   pblczero::NetworkFormat::MOVES_LEFT_V1) &&
+                  options.GetOrDefault<bool>("mlh", true);
+    if (moves_left_) {
+      if (attn_body_) {
+        auto embedded_mov = std::make_unique<EmbeddingLayer<DataType>>(
+            encoder_last_, weights.ip_mov_w, weights.ip_mov_b, scratch_mem_,
+            act, *sycl_queue_);
+        network_.emplace_back(std::move(embedded_mov));
+      } else {
+        auto convMov = std::make_unique<Conv1Layer<DataType>>(
+            resi_last_, weights.moves_left.biases.size(), 8, 8, kNumFilters,
+            act, true, *sycl_queue_);
+        convMov->LoadWeights(&weights.moves_left.weights[0],
+                             &weights.moves_left.biases[0], scratch_mem_);
+        network_.emplace_back(std::move(convMov));
+      }
+      auto FCMov1 = std::make_unique<FCLayer<DataType>>(
+          getLastLayer(), weights.ip1_mov_b.size(), 1, 1, true, act,
+          *sycl_queue_);
+      FCMov1->LoadWeights(&weights.ip1_mov_w[0], &weights.ip1_mov_b[0],
+                          scratch_mem_);
+      network_.emplace_back(std::move(FCMov1));
+
+      auto FCMov2 = std::make_unique<FCLayer<DataType>>(
+          getLastLayer(), 1, 1, 1, true, ACTIVATION_RELU, *sycl_queue_);
+      FCMov2->LoadWeights(&weights.ip2_mov_w[0], &weights.ip2_mov_b[0],
+                          scratch_mem_);
+      network_.emplace_back(std::move(FCMov2));
+    }
+
+    // 3. Allocate GPU memory for running the network:
+    //    - three buffers of max size are enough (one to hold input, second to
+    //      hold output and third to hold skip connection's input).
+
+    // size of input to the network
+    size_t maxSize = max_batch_size_ * kNumInputPlanes * 64 * sizeof(DataType);
+
+    // take max size of all layers
+    for (auto& layer : network_) {
+      maxSize = std::max(maxSize, layer->GetOutputSize(max_batch_size_));
+    }
+
+    if ((attn_policy_ || use_res_block_winograd_fuse_opt_ || attn_body_) &&
+        (scratch_size_ > maxSize)) {
+      maxSize = scratch_size_;
+    }
+
+    // Without multi-stream the network owns the three tensor buffers; with
+    // multi-stream, tensor_mem_size_ is passed to each InputsOutputs instead.
+    if (!multi_stream_) {
+      for (auto& mem : tensor_mem_) {
+        mem = (DataType*)sycl::malloc_device(maxSize, *sycl_queue_);
+        // Check before memset: a failed allocation returns nullptr.
+        if (mem == nullptr) {
+          throw Exception("Failed to allocate " + std::to_string(maxSize) +
+                          " bytes of device tensor memory.");
+        }
+        sycl_queue_->memset(mem, 0, maxSize).wait();
+      }
+    }
+
+    tensor_mem_size_ = multi_stream_ ? maxSize : 0;
+
+    // Create one InputsOutputs object up front: the first call to allocate
+    // memory, create the stream, etc. takes really long (600 ms). The object
+    // is discarded when this constructor returns; it is not added to
+    // free_inputs_outputs_.
+    //CERR << "Creating Inputs Outputs. ";
+    std::unique_ptr<InputsOutputs> io = GetInputsOutputs();
+    //CERR << "Done loading network. ";
+  }
+
+  // Runs one forward pass over the first `batchSize` positions stored in `io`
+  // and leaves the results in io's output buffers (policy in op_policy_mem_,
+  // value in op_value_mem_shared_, moves-left in op_moves_left_mem_shared_).
+  // For WDL nets the three value outputs per sample are turned into
+  // probabilities with a softmax on the host before returning.
+  //
+  // Layers are evaluated in the order the constructor appended them to
+  // network_. The Eval() calls below presumably take (batch, output, input,
+  // second input, scratch, scratch size, queue, offset pointers); see
+  // BaseLayer for the authoritative signature.
+  void forwardEval(InputsOutputs* io, int batchSize) {
+    // Without multi-stream every evaluation shares tensor_mem_ and
+    // scratch_mem_, so only one may run at a time. The lock is taken through
+    // a guard so that it is released even if a layer throws.
+    std::unique_lock<std::mutex> gpu_guard(lock_, std::defer_lock);
+    if (!multi_stream_) gpu_guard.lock();
+
+#ifdef DEBUG_RAW_NPS
+    auto t_start = std::chrono::high_resolution_clock::now();
+#endif
+
+    // Expand packed planes to full planes.
+    uint64_t* ipDataMasks = io->input_masks_mem_shared_;
+    float* ipDataValues = io->input_val_mem_shared_;
+    sycl::queue io_sycl_queue_ = io->q_ct1;
+
+    DataType* tensor_mem[3];
+    void* scratch_mem;
+    DataType*** offset_pointers;
+    DataType*** head_offset_pointers;
+
+    if (multi_stream_) {
+      // We use tensor and scratch memory from InputOutputs (so that multiple
+      // requests can run in parallel)
+      for (int i = 0; i < 3; i++) tensor_mem[i] = (DataType*)io->tensor_mem_[i];
+      scratch_mem = io->scratch_mem_;
+      offset_pointers = (DataType***)&io->offset_pointers_;
+      head_offset_pointers = (DataType***)&io->head_offset_pointers_;
+    } else {
+      for (int i = 0; i < 3; i++) tensor_mem[i] = tensor_mem_[i];
+      scratch_mem = scratch_mem_;
+      offset_pointers = (DataType***)&offset_pointers_;
+      head_offset_pointers = (DataType***)&head_offset_pointers_;
+    }
+
+    bool fp16 = std::is_same<sycl::half, DataType>::value;
+    if (fp16) {
+      expandPlanes_Fp16_NCHW((sycl::half*)(tensor_mem[0]), ipDataMasks,
+                             ipDataValues, batchSize * kInputPlanes,
+                             io_sycl_queue_);
+    } else {
+      expandPlanes_Fp32_NCHW((float*)(tensor_mem[0]), ipDataMasks, ipDataValues,
+                             batchSize * kInputPlanes, io_sycl_queue_);
+    }
+
+    float* opPol = io->op_policy_mem_gpu_;
+    float* opVal = io->op_value_mem_shared_;
+    float* opMov = io->op_moves_left_mem_shared_;
+
+    // The persistent-L2-cache optimisation of the CUDA backend
+    // (cudaStreamSetAttribute with an access policy window) has not been
+    // migrated to SYCL, so it is never enabled here.
+    bool enableCacheOpt = false;
+    DataType* skip_connection =
+        use_res_block_winograd_fuse_opt_ ? tensor_mem[1] : tensor_mem[2];
+
+    // Index of the next layer of network_ to evaluate.
+    int l = 0;
+
+    // `flow` holds the output of the network body; spare1/spare2 are the two
+    // remaining buffers, free for use by the heads.
+    DataType* flow = tensor_mem[0];
+    DataType* spare1 = tensor_mem[1];
+    DataType* spare2 = tensor_mem[2];
+
+    if (numBlocks_ > 0) {
+      // Input.
+      network_[l++]->Eval(batchSize, skip_connection, tensor_mem[0], nullptr,
+                          scratch_mem, scratch_size_, io_sycl_queue_,
+                          nullptr);  // input conv
+
+      // Residual block.
+      for (int block = 0; block < numBlocks_; block++) {
+        if (use_res_block_winograd_fuse_opt_) {
+          network_[l++]->Eval(batchSize, tensor_mem[2], skip_connection,
+                              nullptr, enableCacheOpt ? nullptr : scratch_mem,
+                              scratch_size_, io_sycl_queue_, nullptr);  // block
+        } else {
+          network_[l++]->Eval(batchSize, tensor_mem[0], tensor_mem[2], nullptr,
+                              scratch_mem, scratch_size_, io_sycl_queue_,
+                              nullptr);  // conv1
+
+          network_[l++]->Eval(batchSize, tensor_mem[2], tensor_mem[0],
+                              tensor_mem[2], scratch_mem, scratch_size_,
+                              io_sycl_queue_, nullptr);  // conv2
+        }
+      }
+
+      flow = tensor_mem[2];
+      spare1 = tensor_mem[0];
+      spare2 = tensor_mem[1];
+    }
+
+    if (attn_body_) {
+      network_[l++]->Eval(
+          batchSize, tensor_mem[1],
+          (numBlocks_ > 0) ? tensor_mem[2] : tensor_mem[0],
+          (numBlocks_ > 0) ? tensor_mem[0] : tensor_mem[2], scratch_mem,
+          scratch_size_, io_sycl_queue_,
+          offset_pointers);  // Entire attention body of the network
+
+      flow = tensor_mem[1];
+      spare1 = tensor_mem[0];
+      spare2 = tensor_mem[2];
+    }
+
+    // Policy head. In fp16 mode the last layer writes to a spare buffer which
+    // is then converted to float into opPol; in fp32 mode the last layer
+    // writes to opPol directly.
+    if (attn_policy_) {
+      network_[l++]->Eval(
+          batchSize, spare1, flow, spare2, scratch_mem, scratch_size_,
+          io_sycl_queue_,
+          head_offset_pointers);  // Entire Attention policy head except for the
+                                  // policy map
+      if (fp16) {
+        network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
+                            scratch_size_, io_sycl_queue_,
+                            nullptr);  // policy map layer
+
+        copyTypeConverted(opPol, (sycl::half*)spare2,
+                          batchSize * kNumOutputPolicy,
+                          io_sycl_queue_);  // POLICY output
+      } else {
+        network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr,
+                            scratch_mem, scratch_size_, io_sycl_queue_,
+                            nullptr);  // policy map layer  // POLICY output
+      }
+
+    } else if (conv_policy_) {
+      network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem,
+                          scratch_size_, io_sycl_queue_,
+                          nullptr);  // policy conv1
+
+      network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
+                          scratch_size_, io_sycl_queue_,
+                          nullptr);  // policy conv2
+
+      if (fp16) {
+        network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem,
+                            scratch_size_, io_sycl_queue_,
+                            nullptr);  // policy map layer
+
+        copyTypeConverted(opPol, (sycl::half*)(spare1),
+                          batchSize * kNumOutputPolicy,
+                          io_sycl_queue_);  // POLICY output
+      } else {
+        network_[l++]->Eval(batchSize, (DataType*)opPol, spare2, nullptr,
+                            scratch_mem, scratch_size_, io_sycl_queue_,
+                            nullptr);  // policy map layer  // POLICY output
+      }
+
+    } else {
+      network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem,
+                          scratch_size_, io_sycl_queue_, nullptr);  // pol conv
+
+      if (fp16) {
+        network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
+                            scratch_size_, io_sycl_queue_, nullptr);  // pol FC
+
+        copyTypeConverted(opPol, (sycl::half*)(spare2),
+                          batchSize * kNumOutputPolicy,
+                          io_sycl_queue_);  // POLICY
+      } else {
+        network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr,
+                            scratch_mem, scratch_size_, io_sycl_queue_,
+                            nullptr);  // pol FC  // POLICY
+      }
+    }
+
+    // value head
+    if (fp16) {
+      network_[l++]->Eval(batchSize, spare1, flow, spare2, scratch_mem,
+                          scratch_size_, io_sycl_queue_,
+                          nullptr);  // value head
+
+      copyTypeConverted(opVal, (sycl::half*)spare1,
+                        wdl_ ? 3 * batchSize : batchSize, io_sycl_queue_);
+    } else {
+      network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2,
+                          scratch_mem, scratch_size_, io_sycl_queue_,
+                          nullptr);  // value head
+    }
+
+    if (moves_left_) {
+      // Moves left head
+      network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem,
+                          scratch_size_, io_sycl_queue_,
+                          nullptr);  // moves conv or embedding
+
+      network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem,
+                          scratch_size_, io_sycl_queue_,
+                          nullptr);  // moves FC1
+
+      // Moves left FC2
+      if (fp16) {
+        // TODO: consider fusing the bias-add of FC2 with format conversion.
+        network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem,
+                            scratch_size_, io_sycl_queue_, nullptr);
+
+        copyTypeConverted(opMov, (sycl::half*)(spare1), batchSize,
+                          io_sycl_queue_);
+      } else {
+        network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr,
+                            scratch_mem, scratch_size_, io_sycl_queue_,
+                            nullptr);
+      }
+    }
+
+    // Copy policy output from device memory to host memory.
+    auto event = io_sycl_queue_.memcpy(
+        io->op_policy_mem_, io->op_policy_mem_gpu_,
+        sizeof(float) * kNumOutputPolicy * batchSize);
+
+    if (!multi_stream_) {
+      // The next thread can start using the GPU now.
+      gpu_guard.unlock();
+    }
+
+    // Wait for the copy submitted above before reading results on the host.
+    event.wait();
+
+    if (wdl_) {
+      // Value softmax done cpu side. The maximum is subtracted before
+      // exponentiating for numerical stability.
+      for (int i = 0; i < batchSize; i++) {
+        float win = io->op_value_mem_shared_[3 * i + 0];
+        float draw = io->op_value_mem_shared_[3 * i + 1];
+        float loss = io->op_value_mem_shared_[3 * i + 2];
+        float m = std::max({win, draw, loss});
+        win = std::exp(win - m);
+        draw = std::exp(draw - m);
+        loss = std::exp(loss - m);
+        float sum = win + draw + loss;
+        win /= sum;
+        loss /= sum;
+        draw /= sum;
+        io->op_value_mem_shared_[3 * i + 0] = win;
+        io->op_value_mem_shared_[3 * i + 1] = draw;
+        io->op_value_mem_shared_[3 * i + 2] = loss;
+      }
+    }
+  }
+
+  // Releases everything the constructor allocated: the layers, the pooled
+  // InputsOutputs objects, the device buffers and finally the queue itself.
+  ~SyclNetwork() {
+    // Layers and pooled InputsOutputs were constructed with *sycl_queue_ and
+    // may still refer to it, so destroy them while the queue is alive rather
+    // than leaving them to implicit member destruction after this body.
+    network_.clear();
+    free_inputs_outputs_.clear();
+
+    if (scratch_mem_)
+      sycl::free(scratch_mem_, *sycl_queue_);
+    // tensor_mem_ is only allocated by this class when multi-stream is off.
+    if (!multi_stream_) {
+      for (auto mem : tensor_mem_) {
+        if (mem)
+          sycl::free(mem, *sycl_queue_);
+      }
+      if (offset_pointers_)
+        sycl::free(offset_pointers_, *sycl_queue_);
+      if (head_offset_pointers_)
+        sycl::free(head_offset_pointers_, *sycl_queue_);
+    }
+
+    // The queue was created with `new` in the constructor and was previously
+    // never deleted.
+    delete sycl_queue_;
+    sycl_queue_ = nullptr;
+  }
+
+  // Capabilities recorded from the weights file's network format (input,
+  // output and moves-left formats) at construction time.
+  const NetworkCapabilities& GetCapabilities() const override {
+    const NetworkCapabilities& caps = capabilities_;
+    return caps;
+  }
+
+  // True when the selected SYCL device reported itself as a CPU device;
+  // used for thread handling.
+  bool IsCpu() const override {
+    return is_cpu_;
+  }
+
+  // Two threads when multi-stream evaluation is enabled, one otherwise.
+  int GetThreads() const override {
+    return multi_stream_ ? 2 : 1;
+  }
+
+  // Suggested minibatch size: a fixed 47 on CPU devices, otherwise twice the
+  // number of compute units (a simple heuristic that seems to work for a wide
+  // range of GPUs).
+  int GetMiniBatchSize() const override {
+    return is_cpu_ ? 47 : 2 * compute_units_;
+  }
+  
+  // Creates a new computation bound to this network, carrying the network's
+  // WDL and moves-left settings.
+  std::unique_ptr<NetworkComputation> NewComputation() override {
+    using Computation = SyclNetworkComputation<DataType>;
+    return std::make_unique<Computation>(this, wdl_, moves_left_);
+  }
+
+  // Hands out an InputsOutputs object: the oldest one from the free list when
+  // one is available, otherwise a newly constructed one. Guarded by
+  // inputs_outputs_lock_.
+  std::unique_ptr<InputsOutputs> GetInputsOutputs() {
+    std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
+
+    // Reuse a pooled object when possible.
+    if (!free_inputs_outputs_.empty()) {
+      std::unique_ptr<InputsOutputs> reused =
+          std::move(free_inputs_outputs_.front());
+      free_inputs_outputs_.pop_front();
+      return reused;
+    }
+
+    // Pool is empty: build a fresh object.
+    return std::make_unique<InputsOutputs>(
+        max_batch_size_, wdl_, moves_left_, *sycl_queue_, tensor_mem_size_,
+        scratch_size_,
+        !has_tensor_cores_ && std::is_same<sycl::half, DataType>::value);
+  }
+
+  // Returns `resource` to the back of the free list so a later
+  // GetInputsOutputs() call can reuse it. Guarded by inputs_outputs_lock_.
+  void ReleaseInputsOutputs(std::unique_ptr<InputsOutputs> resource) {
+    std::lock_guard<std::mutex> lock(inputs_outputs_lock_);
+    free_inputs_outputs_.emplace_back(std::move(resource));
+  }
+
+
+ private:
+  const NetworkCapabilities capabilities_;
+  int gpu_id_;
+  int l2_cache_size_;
+  int max_batch_size_;
+  int compute_units_;
+  bool wdl_;
+  bool moves_left_;
+  bool use_res_block_winograd_fuse_opt_;  // fuse operations inside the residual
+                                          // tower
+  bool multi_stream_;                     // run multiple parallel network evals
+  bool allow_cache_opt_;  // try to fit residual block activations in L2 cache
+
+
+  // Currently only one NN Eval can happen at a time (we can fix this if
+  // needed by allocating more memory).
+  mutable std::mutex lock_;
+  sycl::queue* sycl_queue_;
+  bool is_cpu_;
+
+
+  int numBlocks_;
+  int numFilters_;
+  bool has_se_;
+  bool conv_policy_;
+  bool attn_policy_;
+  bool attn_body_;
+  int num_encoder_blocks_;
+  std::vector<std::unique_ptr<BaseLayer<DataType>>> network_;
+  // Non-owning pointer to the layer most recently appended to network_.
+  // network_ must not be empty.
+  BaseLayer<DataType>* getLastLayer() {
+    auto& newest = network_.back();
+    return newest.get();
+  }
+
+  BaseLayer<DataType>* resi_last_;
+  BaseLayer<DataType>* encoder_last_;
+
+  size_t tensor_mem_size_;
+  size_t scratch_size_;
+
+  // this copy is used only for initialization when multi-stream is enabled
+  void* scratch_mem_;
+  // this is only used when multi-stream is disabled
+  void** offset_pointers_ = nullptr;
+  void** head_offset_pointers_ = nullptr;
+
+  bool has_tensor_cores_;
+
+  // not used when multi-stream is enabled
+  //dpct::queue_ptr cublas_;
+  DataType* tensor_mem_[3];
+
+  mutable std::mutex inputs_outputs_lock_;
+  std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;
+
+  // Logs the platform and key properties of the device behind `mqueue`.
+  void showDeviceInfo(const sycl::queue& mqueue) const {
+    const sycl::device dev = mqueue.get_device();
+    const std::string device_type = dev.is_gpu() ? "GPU" : "CPU";
+    CERR << "Device-Info...";
+    CERR << "Platform: "
+         << dev.get_platform().get_info<sycl::info::platform::name>()
+         << " selected";
+    CERR << device_type << ": " << dev.get_info<sycl::info::device::name>();
+    CERR << device_type << ": "
+         << dev.get_info<sycl::info::device::max_mem_alloc_size>() / (1024 * 1024)
+         << " MB (max allocation)";
+    CERR << device_type << " clock frequency: "
+         << dev.get_info<sycl::info::device::max_clock_frequency>() << " MHz";
+    CERR << "L2 cache capacity: "  // global-memory cache, not local_mem_size
+         << dev.get_info<sycl::info::device::global_mem_cache_size>() / 1024
+         << " KB";
+    CERR << "Global memory size: "
+         << dev.get_info<sycl::info::device::global_mem_size>() / (1024 * 1024)
+         << " MB";
+    CERR << "...Device-Info-End";
+  }
+    
+  // Logs one line per device of every platform: platform index and version,
+  // device class (GPU / CPU / Other) and the owning platform's name.
+  void showPlatformInfo(const std::vector<sycl::platform>& platforms) {
+    CERR << "Platform-List...";
+    for (size_t idx = 0; idx < platforms.size(); ++idx) {
+      const std::string version =
+          platforms[idx].get_info<sycl::info::platform::version>();
+      for (const auto& dev : platforms[idx].get_devices()) {
+        const auto type = dev.get_info<sycl::info::device::device_type>();
+        const char* kind = "Other";
+        if (type == sycl::info::device_type::gpu) {
+          kind = "GPU";
+        } else if (type == sycl::info::device_type::cpu) {
+          kind = "CPU";
+        }
+        CERR << "Platform " << idx << " (version: " << version << "):" << kind
+             << " (Name" << ": "
+             << dev.get_platform().get_info<sycl::info::platform::name>()
+             << ")";
+      }
+    }
+    CERR << "...Platform-List-End";
+  }
+};
+
+template <typename DataType>
+SyclNetworkComputation<DataType>::SyclNetworkComputation(
+    SyclNetwork<DataType>* network, bool wdl, bool moves_left)
+    : wdl_(wdl), moves_left_(moves_left), network_(network) {
+  batch_size_ = 0;  // no positions queued yet
+  inputs_outputs_ = network_->GetInputsOutputs();  // take buffers from the pool
+}
+
+template <typename DataType>
+SyclNetworkComputation<DataType>::~SyclNetworkComputation() {
+  network_->ReleaseInputsOutputs(std::move(inputs_outputs_));  // back to pool
+}
+
+template <typename DataType>  // Hands buffers + batch size to forwardEval.
+void SyclNetworkComputation<DataType>::ComputeBlocking() {
+  network_->forwardEval(inputs_outputs_.get(), GetBatchSize());
+}
+
+template <typename DataType>  // instantiated for float and sycl::half
+std::unique_ptr<Network> MakeSyclNetwork(const std::optional<WeightsFile>& w,
+                                         const OptionsDict& options) {
+  if (!w) {  // weights are mandatory: reject an empty optional
+    throw Exception(
+        "The sycl" +
+        std::string(std::is_same<sycl::half, DataType>::value ? "-fp16" : "") +
+        " backend requires a network file.");
+  }
+  const WeightsFile& weights = *w;
+  auto nf = weights.format().network_format();
+  using NF = pblczero::NetworkFormat;  // each switch below is a whitelist
+  switch (nf.network()) {  // network structure
+    case NF::NETWORK_CLASSICAL_WITH_HEADFORMAT:
+    case NF::NETWORK_SE_WITH_HEADFORMAT:
+    case NF::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT:
+    case NF::NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT:
+      break;
+    default:
+      throw Exception("Network format " +
+                      NF::NetworkStructure_Name(nf.network()) +
+                      " is not supported by the SYCL backend.");
+  }
+  switch (nf.policy()) {  // policy head format
+    case NF::POLICY_CLASSICAL:
+    case NF::POLICY_CONVOLUTION:
+    case NF::POLICY_ATTENTION:
+      break;
+    default:
+      throw Exception("Policy format " + NF::PolicyFormat_Name(nf.policy()) +
+                      " is not supported by the SYCL backend.");
+  }
+  switch (nf.value()) {  // value head format
+    case NF::VALUE_CLASSICAL:
+    case NF::VALUE_WDL:
+      break;
+    default:
+      throw Exception("Value format " + NF::ValueFormat_Name(nf.value()) +
+                      " is not supported by the SYCL backend.");
+  }
+  switch (nf.moves_left()) {  // moves-left head format
+    case NF::MOVES_LEFT_NONE:
+    case NF::MOVES_LEFT_V1:
+      break;
+    default:
+      throw Exception("Moves left head format " +
+                      NF::MovesLeftFormat_Name(nf.moves_left()) +
+                      " is not supported by the SYCL backend.");
+  }
+  switch (nf.default_activation()) {  // default activation function
+    case NF::DEFAULT_ACTIVATION_RELU:
+    case NF::DEFAULT_ACTIVATION_MISH:
+      break;
+    default:
+      throw Exception("Default activation " +
+                      NF::DefaultActivation_Name(nf.default_activation()) +
+                      " is not supported by the SYCL backend.");
+  }
+  switch (nf.input_embedding()) {  // input embedding format
+    case NF::INPUT_EMBEDDING_NONE:
+    case NF::INPUT_EMBEDDING_PE_MAP:
+    case NF::INPUT_EMBEDDING_PE_DENSE:
+      break;
+    default:
+      throw Exception("Input embedding " +
+                      NF::InputEmbeddingFormat_Name(nf.input_embedding()) +
+                      " is not supported by the SYCL backend.");
+  }
+  return std::make_unique<SyclNetwork<DataType>>(weights, options);
+}
+
+// "sycl-auto" factory: uses fp16 when device "gpu" (default 0) has aspect fp16,
+// else fp32. Out-of-range ids (negatives wrap to huge unsigned) are rejected.
+std::unique_ptr<Network> MakeSyclNetworkAuto(
+    const std::optional<WeightsFile>& weights, const OptionsDict& options) {
+  const int gpu_id = options.GetOrDefault<int>("gpu", 0);
+  const auto devices = sycl::device::get_devices();
+  if (static_cast<size_t>(gpu_id) >= devices.size()) {
+    throw Exception("Invalid GPU ID");
+  }
+  CERR << "Trying to switch to [sycl-fp16]...";
+  if (!devices[gpu_id].has(sycl::aspect::fp16)) {
+    CERR << "Device does not support sycl-fp16";
+    CERR << "Switched to [sycl]...";
+    return MakeSyclNetwork<float>(weights, options);
+  }
+  CERR << "Switched to [sycl-fp16]...";
+  return MakeSyclNetwork<sycl::half>(weights, options);
+}
+
+REGISTER_NETWORK("sycl-auto", MakeSyclNetworkAuto, 132)
+REGISTER_NETWORK("sycl", MakeSyclNetwork<float>, 131)
+REGISTER_NETWORK("sycl-fp16", MakeSyclNetwork<sycl::half>, 130)
+
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/sycl_common.h b/src/neural/backends/sycl/sycl_common.h
new file mode 100644
index 0000000000..bbaee55645
--- /dev/null
+++ b/src/neural/backends/sycl/sycl_common.h
@@ -0,0 +1,61 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier: GPL-3.0-or-later
+*/
+
+#pragma once
+
+#include <sycl/sycl.hpp>
+
+#include "utils/exception.h"
+
+#if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#define SYCL_SUB_GROUP_SIZE 64
+#else
+#define SYCL_SUB_GROUP_SIZE 32
+#endif
+
+namespace lczero {
+namespace sycldnn_backend {
+
+static constexpr int kNumOutputPolicy = 1858;
+
+// max supported filter count for fast path
+// TODO: extend it to cover bigger networks!
+// (We are limited by no of registers per thread)
+static constexpr int kMaxResBlockFusingChannels = 384;  // limit on num_filters
+static constexpr int kMaxResBlockFusingSeKFp16Ampere =
+    512;  // (use a different kernel with reduced register pressure)
+static constexpr int kMaxResBlockFusingSeK =
+    128;  // limit on (num_filters / se_ratio)
+static constexpr int kMaxResBlockFusingSeFp16AmpereSmem =
+    72 * kMaxResBlockFusingSeKFp16Ampere *
+    sizeof(sycl::half);  // shared memory used by the special
+                         // kernel
+
+#ifdef USE_CUBLAS
+void CublasError(int status, const char* file, const int& line);
+
+#define ReportCUBLASErrors(status) CublasError(status, __FILE__, __LINE__)
+#endif
+
+// Integer division rounding up; exact for a >= 0 and b > 0.
+inline int DivUp(int a, int b) { return (b + a - 1) / b; }
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/backends/sycl/winograd_helper.h b/src/neural/backends/sycl/winograd_helper.h
new file mode 100644
index 0000000000..175b925506
--- /dev/null
+++ b/src/neural/backends/sycl/winograd_helper.h
@@ -0,0 +1,961 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2020-2024 The LCZero Authors
+  Copyright (C) 2023 Intel Corporation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   
+  SPDX-License-Identifier: GPL-3.0-or-later
+*/
+
+#include <sycl/sycl.hpp>
+
+namespace lczero {
+namespace sycldnn_backend {
+
+// Mish activation: el * tanh(softplus(el)). With e = exp(el) and
+// n = e^2 + 2e, tanh(softplus(el)) == n / (n + 2). Both return forms below
+// are algebraic rearrangements of el * n / (n + 2); the form is selected by
+// the el <= -0.6 threshold.
+[[gnu::always_inline]]
+inline float mishActivate(float el) {
+  const auto expEl = sycl::native::exp(el);
+  const auto numer = expEl * expEl + 2.0f * expEl;
+  const auto ratio = el / (numer + 2.0f);
+  return (el <= -0.6f) ? numer * ratio : el - 2.0f * ratio;
+}
+// Applies `activation` to cVal and returns the result. Enumerators without
+// a case below (e.g. ACTIVATION_NONE) return cVal unchanged.
+[[gnu::always_inline]]
+inline float activate(float cVal, ActivationFunction activation) {
+  switch (activation) {
+    case ACTIVATION_RELU:
+      return cVal < 0 ? 0.0f : cVal;
+    case ACTIVATION_RELU_2: {
+      // Squared ReLU: clamp first, then square.
+      const float clamped = cVal < 0 ? 0.0f : cVal;
+      return clamped * clamped;
+    }
+    case ACTIVATION_TANH:
+      return sycl::tanh(cVal);
+    case ACTIVATION_SIGMOID:
+      return 1.0f / (1.0f + sycl::native::exp(-cVal));
+    case ACTIVATION_SELU: {
+      // scale * x for x > 0, scale * alpha * (exp(x) - 1) otherwise.
+      constexpr float kAlpha = 1.67326324f;
+      constexpr float kScale = 1.05070098f;
+      if (cVal > 0) return kScale * cVal;
+      return kScale * kAlpha * (sycl::native::exp(cVal) - 1.0f);
+    }
+    case ACTIVATION_MISH:
+      return mishActivate(cVal);
+    case ACTIVATION_SWISH:
+      // x * sigmoid(x), written as a single division.
+      return cVal / (1.0f + sycl::native::exp(-cVal));
+    default:
+      // Remaining enumerators are treated as the identity.
+      return cVal;
+  }
+}
+
+template <typename T, int M, int N, int K>  // c[MxN] = a[MxK] * b[KxN], row-major
+[[gnu::always_inline]]
+inline void matrixMul_gpu_serial(T* c, const T* a, const T* b) {
+#pragma unroll
+  for (int row = 0; row < M; ++row)
+#pragma unroll
+    for (int col = 0; col < N; ++col) {
+      T acc = 0;
+#pragma unroll
+      for (int t = 0; t < K; ++t) acc += a[row * K + t] * b[t * N + col];
+      c[row * N + col] = acc;
+    }
+}
+
+template <typename T>
+[[gnu::always_inline]]
+inline void FilterTransform4x4(T* transformed_filter,
+                                        const T* filter) {
+  // Computes G * filter * Gt: 3x3 filter in, 6x6 transformed_filter out.
+  T G[6 * 3] = {1.0f / 4,  0,         0,         -1.0f / 6,  -1.0f / 6,
+                -1.0f / 6, -1.0f / 6, 1.0f / 6,  -1.0f / 6,  1.0f / 24,
+                1.0f / 12, 1.0f / 6,  1.0f / 24, -1.0f / 12, 1.0f / 6,
+                0,         0,         1};
+
+  T Gt[3 * 6] = {1.0f / 4, -1.0f / 6, -1.0f / 6, 1.0f / 24, 1.0f / 24,  0,
+                 0,        -1.0f / 6, 1.0f / 6,  1.0f / 12, -1.0f / 12, 0,
+                 0,        -1.0f / 6, -1.0f / 6, 1.0f / 6,  1.0f / 6,   1};
+
+  T temp_filter[6 * 3];  // holds G * filter (6x3)
+  matrixMul_gpu_serial<T, 6, 3, 3>(temp_filter, G, filter);
+  matrixMul_gpu_serial<T, 6, 6, 3>(transformed_filter, temp_filter, Gt);
+}
+
+template <typename T>
+[[gnu::always_inline]]
+inline void InputTransform4x4(T* transformedInput, const T* input) {
+  // Computes Bt * input * B on a 6x6 tile; in-place use is safe (see tempIp1).
+  const T Bt[6 * 6] = {4, 0, -5, 0,  1, 0, 0, -4, -4, 1,  1, 0,
+                       0, 4, -4, -1, 1, 0, 0, -2, -1, 2,  1, 0,
+                       0, 2, -1, -2, 1, 0, 0, 4,  0,  -5, 0, 1};
+
+  const T B[6 * 6] = {4,  0,  0,  0,  0,  0, 0, -4, 4,  -2, 2,  4,
+                      -5, -4, -4, -1, -1, 0, 0, 1,  -1, 2,  -2, -5,
+                      1,  1,  1,  1,  1,  0, 0, 0,  0,  0,  0,  1};
+
+  T tempIp1[6 * 6];  // Bt * input; input is fully consumed before the output is written
+  matrixMul_gpu_serial<T, 6, 6, 6>(tempIp1, Bt, input);
+  matrixMul_gpu_serial<T, 6, 6, 6>(transformedInput, tempIp1, B);
+}
+
+template <typename T>
+[[gnu::always_inline]]
+inline void OutputTransform4x4(T* output, const T* transformedOutput) {
+  // Computes At * transformedOutput * A: 6x6 transformed tile in, 4x4 out.
+  const T At[4 * 6] = {1, 1, 1, 1, 1, 0, 0, 1, -1, 2, -2, 0,
+                       0, 1, 1, 4, 4, 0, 0, 1, -1, 8, -8, 1};
+
+  const T A[6 * 4] = {1, 0, 0, 0, 1, 1,  1, 1,  1, -1, 1, -1,
+                      1, 2, 4, 8, 1, -2, 4, -8, 0, 0,  0, 1};
+
+  T tempOp[4 * 6];  // holds At * transformedOutput (4x6)
+  matrixMul_gpu_serial<T, 4, 6, 6>(tempOp, At, transformedOutput);
+  matrixMul_gpu_serial<T, 4, 4, 6>(output, tempOp, A);
+}
+
+#define FILTER_IDX_NCHW(k, c, h, w) ((k)*C * S * R + (c)*S * R + (h)*R + (w))
+// One work-item per (k, c) pair (tid = k * C + c, bounded by 'elements'): loads
+// the 3x3 filter, applies FilterTransform4x4 and stores the 6x6 result (HWCK).
+template <typename T>
+void filterTransform_kernel(int K, int C, int elements, T* transformed_filter,
+                            const T* filter, const sycl::nd_item<3>& item_ct1) {
+  int tid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+            item_ct1.get_local_id(2);
+  if (tid >= elements) return;
+
+  constexpr int S = 3;
+  constexpr int R = 3;
+
+  int c = tid % C;
+  int k = tid / C;
+
+  T filter_tile[3][3];
+  T transformed_tile[6][6];
+
+  // read input from memory
+  for (int s = 0; s < S; s++)
+    for (int r = 0; r < R; r++)
+      filter_tile[s][r] = filter[FILTER_IDX_NCHW(k, c, s, r)];
+
+  // transform it
+  FilterTransform4x4(&(transformed_tile[0][0]), &(filter_tile[0][0]));
+
+  // write to output (output is in HWCK layout)
+  for (int i = 0; i < 6; i++)
+    for (int j = 0; j < 6; j++) {
+      transformed_filter[i * 6 * C * K + j * C * K + c * K + k] =
+          transformed_tile[i][j];
+    }
+}
+
+#define INDEX_NCHW(n, c, h, w) ((n)*C * 8 * 8 + (c)*8 * 8 + (h)*8 + (w))
+#define INDEX_NHCW(n, c, h, w) ((n)*C * 8 * 8 + (h)*C * 8 + (c)*8 + (w))
+
+// Index into the intermediate/temp tensor of transformed 6x6 tiles
+// (H == W == 6). Its N dimension is GemmN: four tiles (2x2) per board.
+// All index macros here read 'C' (and 'N') from the scope they are used in.
+#define GemmN (N * 4)
+#define TEMP_INDEX_HWNC(h, w, n, c) \
+  ((h)*6 * GemmN * C + (w)*GemmN * C + (n)*C + (c))
+
+// Launch shape: 'N' work-groups of 'C' work-items (group id = n, local id = c).
+// Each work-item loads one 8x8 plane (NHCW layout if nhcw, else NCHW), splits
+// it into four overlapping zero-padded 6x6 tiles, applies InputTransform4x4
+// to each and stores them to 'output' in HWNC layout (tile index n * 4 + t).
+template <typename T, bool nhcw>
+void InputTransform_kernel(int N, int C, const T* input, T* output,
+                           const sycl::nd_item<3> &item_ct1) {
+  int c = item_ct1.get_local_id(2);
+  int n = item_ct1.get_group(2);
+
+  T board[8][8];
+
+  const bool fp16 = std::is_same<sycl::half, T>::value;  // 8 halves == one uint4
+
+// read the board with 128-bit loads (one per row for fp16, two otherwise)
+#pragma unroll
+  for (int y = 0; y < 8; y++) {
+    if (nhcw) {
+      *((sycl::uint4*)(&board[y][0])) =
+          *((sycl::uint4*)(&input[INDEX_NHCW(n, c, y, 0)]));
+      if (!fp16)
+        *((sycl::uint4*)(&board[y][4])) =
+            *((sycl::uint4*)(&input[INDEX_NHCW(n, c, y, 4)]));
+    } else {
+      *((sycl::uint4*)(&board[y][0])) =
+          *((sycl::uint4*)(&input[INDEX_NCHW(n, c, y, 0)]));
+      if (!fp16)
+        *((sycl::uint4*)(&board[y][4])) =
+            *((sycl::uint4*)(&input[INDEX_NCHW(n, c, y, 4)]));
+    }
+  }
+
+  // top-left: board rows/cols 0..4 at tile offset (1, 1); the rest stays zero
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j + 1] = board[i][j];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 0, c)] = inEl[y][x];
+  }
+
+  // top-right: board rows 0..4, cols 3..7 at tile offset (1, 0)
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j] = board[i][j + 3];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 1, c)] = inEl[y][x];
+  }
+
+  // bottom-left: board rows 3..7, cols 0..4 at tile offset (0, 1)
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j + 1] = board[i + 3][j];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 2, c)] = inEl[y][x];
+  }
+
+  // bottom-right: board rows/cols 3..7 at tile offset (0, 0)
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j] = board[i + 3][j + 3];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x];
+  }
+}
+
+#define readw1(row, col) (w1[(row)*se_K + (col)])
+#define readw2(row, col) (w2[(row)*2 * C + (col)])
+
+// Inverse Winograd transform with optional bias, SE, skip-add and activation.
+// 'input' is in transformed space (HWNC layout); 'output' is NCHW, or NHCW
+// when output_nhcw. Launch with 'N' work-groups of 'C' work-items: every
+// work-item produces one 8x8 plane. 'bias' is only read when use_bias is set;
+// w1/b1/w2/b2 and shared_data (indexed up to C) only when use_se is set.
+template <typename T, bool use_se, ActivationFunction activation, bool use_bias,
+          bool use_skip, bool skipInput_nhcw, bool output_nhcw>
+void OutputTransform_kernel(int N, int C, int se_K, T* output,
+                                       const T* input, const T* skip,
+                                       const T* bias, const T* w1, const T* b1,
+                                       const T* w2, const T* b2,
+                                       const sycl::nd_item<3> &item_ct1,
+                                       float *shared_data) {
+  const bool fp16 = std::is_same<sycl::half, T>::value;
+
+  int k = item_ct1.get_local_id(2);
+  int n = item_ct1.get_group(2);
+
+  T board[8][8];
+  T b = use_bias ? bias[k] : (T)0;  // 'bias' may be unset without use_bias
+
+#pragma unroll
+  for (int hStart = 0; hStart < 8; hStart += 4)
+#pragma unroll
+    for (int wStart = 0; wStart < 8; wStart += 4) {
+      //  i) read to per thread registers (for doing output transform)
+      int shln = n * 4 + (hStart / 4) * 2 + (wStart / 4);
+      T outElTransformed[6][6];
+#pragma unroll
+      for (int y = 0; y < 6; y++)
+#pragma unroll
+        for (int x = 0; x < 6; x++)
+          outElTransformed[y][x] = input[TEMP_INDEX_HWNC(y, x, shln, k)];
+
+      // ii) transform it
+      T outEl[4][4];
+      OutputTransform4x4(&outEl[0][0], &outElTransformed[0][0]);
+
+#pragma unroll
+      for (int y = 0; y < 4; y++)
+#pragma unroll
+        for (int x = 0; x < 4; x++) board[hStart + y][wStart + x] = outEl[y][x];
+    }
+
+  // Add bias, and compute the average for SE.
+  float S = 0;
+  float B = 0;
+
+#pragma unroll
+  for (int y = 0; y < 8; y++)
+#pragma unroll
+    for (int x = 0; x < 8; x++) {
+      if (use_bias) board[y][x] += b;
+      if (use_se) S += (float)board[y][x];
+    }
+
+  if (use_se) {
+    float avg = S / 64;
+    shared_data[k] = avg;
+    /*
+    DPCT1065:38: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    // First fully-connected layer for SE
+    if (k < se_K) {
+      S = 0;
+      for (int i = 0; i < C; i++) {
+        S += shared_data[i] * float(readw1(i, k));
+      }
+      S += (float)b1[k];
+      S = activate(S, activation);
+    }
+    /*
+    DPCT1065:39: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (k < se_K) {
+      shared_data[k] = S;  // reuse shared_data for the first-layer outputs
+    }
+    /*
+    DPCT1065:40: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    // Second fully-connected layer for SE
+    S = 0;
+    for (int i = 0; i < se_K; i++) {
+      float val = shared_data[i];
+      S += val * float(readw2(i, k));
+      B += val * float(readw2(i, k + C));
+    }
+    S += (float)b2[k];
+    B += (float)b2[k + C];
+
+    // Sigmoid (only on the scale part).
+    S = 1.0f / (1.0f + sycl::exp(-S));
+  }
+
+  // Scale/bias, add skip connection, apply activation, and write to output.
+  for (int h = 0; h < 8; h++) {
+    if (use_se)
+#pragma unroll
+      for (int w = 0; w < 8; w++) board[h][w] = (T)(float(board[h][w]) * S + B);
+
+    // residual add
+    if (use_skip) {
+      T skipInp[8];
+      if (skipInput_nhcw) {
+        *((sycl::uint4*)(&skipInp[0])) =
+            *((sycl::uint4*)(&skip[INDEX_NHCW(n, k, h, 0)]));
+        if (!fp16)
+          *((sycl::uint4*)(&skipInp[4])) =
+              *((sycl::uint4*)(&skip[INDEX_NHCW(n, k, h, 4)]));
+      } else {
+        *((sycl::uint4*)(&skipInp[0])) =
+            *((sycl::uint4*)(&skip[INDEX_NCHW(n, k, h, 0)]));
+        if (!fp16)
+          *((sycl::uint4*)(&skipInp[4])) =
+              *((sycl::uint4*)(&skip[INDEX_NCHW(n, k, h, 4)]));
+      }
+#pragma unroll
+      for (int w = 0; w < 8; w++) board[h][w] += skipInp[w];
+    }
+
+    // activation (skipped entirely for ACTIVATION_NONE)
+    if (activation != ACTIVATION_NONE) {
+#pragma unroll
+      for (int w = 0; w < 8; w++)
+        board[h][w] = (T)activate((float)board[h][w], activation);
+    }
+
+    // Write to output (use 128 bit writes to store one row a time)
+    if (output_nhcw) {
+      *((sycl::uint4*)(&output[INDEX_NHCW(n, k, h, 0)])) =
+          *((sycl::uint4*)&board[h][0]);
+      if (!fp16)
+        *((sycl::uint4*)(&output[INDEX_NHCW(n, k, h, 4)])) =
+            *((sycl::uint4*)&board[h][4]);
+    } else {
+      *((sycl::uint4*)(&output[INDEX_NCHW(n, k, h, 0)])) =
+          *((sycl::uint4*)&board[h][0]);
+      if (!fp16)
+        *((sycl::uint4*)(&output[INDEX_NCHW(n, k, h, 4)])) =
+            *((sycl::uint4*)&board[h][4]);
+    }
+  }
+}
+
+// Sum-reduces x over 32 lanes via xor shuffles; every lane gets the total.
+[[gnu::always_inline]]
+inline float warpReduce(float x, const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+  for (int mask = 16; mask > 0; mask >>= 1)  // xor strides 16, 8, 4, 2, 1
+    /*
+    DPCT1023:4: The SYCL sub-group does not support mask options for
+    dpct::permute_sub_group_by_xor. You can specify
+    "--use-experimental-features=masked-sub-group-operation" to use the
+    experimental helper function to migrate __shfl_xor_sync.
+    */
+    /*
+    DPCT1096:122: The right-most dimension of the work-group used in the SYCL
+    kernel that calls this function may be less than "32". The function
+    "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU
+    device. Modify the size of the work-group to ensure that the value of the
+    right-most dimension is a multiple of "32".
+    */
+    x += sycl::permute_group_by_xor(item_ct1.get_sub_group(), x, mask);
+
+  return x;
+}
+
+// Max-reduces x over 32 lanes via xor shuffles; every lane gets the maximum.
+[[gnu::always_inline]]
+inline float warpMax(float x, const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+  for (int mask = 16; mask > 0; mask >>= 1)  // xor strides 16, 8, 4, 2, 1
+    /*
+    DPCT1023:5: The SYCL sub-group does not support mask options for
+    dpct::permute_sub_group_by_xor. You can specify
+    "--use-experimental-features=masked-sub-group-operation" to use the
+    experimental helper function to migrate __shfl_xor_sync.
+    */
+    /*
+    DPCT1096:123: The right-most dimension of the work-group used in the SYCL
+    kernel that calls this function may be less than "32". The function
+    "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU
+    device. Modify the size of the work-group to ensure that the value of the
+    right-most dimension is a multiple of "32".
+    */
+    x = sycl::max(x, (float)(sycl::permute_group_by_xor(
+                         item_ct1.get_sub_group(), x, mask)));
+
+  return x;
+}
+
+// Vector load/store helper: copies one T-sized object from src to dst.
+template <typename T>
+[[gnu::always_inline]]
+inline void copyAs(void* dst, const void* src) {
+  *static_cast<T*>(dst) = *static_cast<const T*>(src);
+}
+
+// Fused kernel: inverse Winograd transform + bias + SE + optional skip-add +
+// activation, followed by the input transform for the next layer.
+// 'input' (GEMM output) and 'output' (next GEMM input) are both in transformed
+// space (HWNC layout); 'skip' is NHCW and is overwritten when use_skip is set.
+// Launch with 'N' work-groups of 'C' work-items, one 8x8 plane per work-item.
+// The SE reduction assumes C is a multiple of 32 (it sums C / 32 partials).
+template <typename T, ActivationFunction activation, bool use_bias,
+          bool use_skip>
+/*
+DPCT1110:6: The total declared local variable size in device function
+OutputTransform_SE_relu_InputTransform_kernel exceeds 128 bytes and may cause
+high register pressure. Consult with your hardware vendor to find the total
+register size available and adjust the code, or use smaller sub-group size to
+avoid high register pressure.
+*/
+void OutputTransform_SE_relu_InputTransform_kernel(
+    int N, int C, int se_K, T* output, const T* input, T* skip, const T* bias,
+    const T* w1, const T* b1, const T* w2, const T* b2,
+    const sycl::nd_item<3>& item_ct1, float* shared_data,
+    sycl::local_accessor<float, 2> shared_sums) {
+  const bool fp16 = std::is_same<sycl::half, T>::value;
+
+  int k = item_ct1.get_local_id(2);
+  int n = item_ct1.get_group(2);
+
+  T board[8][8];
+  T b = use_bias ? bias[k] : (T)0;  // 'bias' may be unset without use_bias
+
+#pragma unroll
+  for (int hStart = 0; hStart < 8; hStart += 4)
+#pragma unroll
+    for (int wStart = 0; wStart < 8; wStart += 4) {
+      //  i) read to per thread registers (for doing output transform)
+      int shln = n * 4 + (hStart / 4) * 2 + (wStart / 4);
+      T outElTransformed[6][6];
+#pragma unroll
+      for (int y = 0; y < 6; y++)
+#pragma unroll
+        for (int x = 0; x < 6; x++)
+          outElTransformed[y][x] = input[TEMP_INDEX_HWNC(y, x, shln, k)];
+
+      // ii) transform it
+      T outEl[4][4];
+      OutputTransform4x4(&outEl[0][0], &outElTransformed[0][0]);
+
+#pragma unroll
+      for (int y = 0; y < 4; y++)
+#pragma unroll
+        for (int x = 0; x < 4; x++) board[hStart + y][wStart + x] = outEl[y][x];
+    }
+
+  // Add bias, and compute the average for SE.
+  float S = 0;
+  float B = 0;
+
+#pragma unroll
+  for (int y = 0; y < 8; y++)
+#pragma unroll
+    for (int x = 0; x < 8; x++) {
+      if (use_bias) board[y][x] += b;
+      S += (float)board[y][x];
+    }
+
+  {
+    float avg = S / 64;
+    shared_data[k] = avg;
+
+    int lane = k & 0x1F;  // position inside a group of 32 work-items
+    int warp = k >> 5;    // index of that group of 32
+    /*
+    DPCT1065:41: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    // First fully-connected layer for SE
+
+    // As se_K << C, we want to loop over se_K instead of C
+    // even if it means taking the sum across threads
+
+      // per-warp sums
+
+    for (int i = 0; i < se_K; i++) {
+      float val = shared_data[k] * float(readw1(k, i));
+      val = warpReduce(val, item_ct1);
+      if (lane == 0) shared_sums[warp][i] = val;
+    }
+    /*
+    DPCT1065:42: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+    if (k < se_K) {
+      S = 0;
+      for (int i = 0; i < C / 32; i++) S += shared_sums[i][k];
+
+      S += (float)b1[k];
+      S = activate(S, activation);
+      shared_data[k] = S;
+    }
+
+    /*
+    DPCT1065:43: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    // Second fully-connected layer for SE
+    S = 0;
+    for (int i = 0; i < se_K; i++) {
+      float val = shared_data[i];
+      S += val * float(readw2(i, k));
+      B += val * float(readw2(i, k + C));
+    }
+    S += (float)b2[k];
+    B += (float)b2[k + C];
+
+    // Sigmoid (only on the scale part).
+    S = 1.0f / (1.0f + sycl::exp(-S));
+  }
+
+  // Scale/bias, add skip connection, apply activation, and write to output.
+  for (int h = 0; h < 8; h++) {
+#pragma unroll
+    for (int w = 0; w < 8; w++) board[h][w] = (T)(float(board[h][w]) * S + B);
+
+    // residual add
+    if (use_skip) {
+      T skipInp[8];
+      copyAs<sycl::uint4>(&skipInp[0], &skip[INDEX_NHCW(n, k, h, 0)]);
+      if (!fp16)
+          copyAs<sycl::uint4>(&skipInp[4], &skip[INDEX_NHCW(n, k, h, 4)]);
+#pragma unroll
+      for (int w = 0; w < 8; w++) board[h][w] += skipInp[w];
+    }
+
+    // activation (skipped entirely for ACTIVATION_NONE)
+    if (activation != ACTIVATION_NONE) {
+#pragma unroll
+      for (int w = 0; w < 8; w++)
+        board[h][w] = (T)activate((float)board[h][w], activation);
+    }
+
+    // write un-transformed output to 'skip' if required
+    if (use_skip) {
+      // Write to skip (use 128 bit writes to store one row a time)
+      copyAs<sycl::uint4>(&skip[INDEX_NHCW(n, k, h, 0)], &board[h][0]);
+      if (!fp16)
+          copyAs<sycl::uint4>(&skip[INDEX_NHCW(n, k, h, 4)], &board[h][4]);
+    }
+  }
+
+  // perform input transform
+
+  int c = k;
+  // top-left
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j + 1] = board[i][j];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 0, c)] = inEl[y][x];
+  }
+
+  // top-right
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j] = board[i][j + 3];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 1, c)] = inEl[y][x];
+  }
+
+  // bottom-left
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j + 1] = board[i + 3][j];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 2, c)] = inEl[y][x];
+  }
+
+  // bottom-right
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j] = board[i + 3][j + 3];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x];
+  }
+}
+
+constexpr int kOpInpTransformBlockSize = 64;  // k stride per work-group
+template <typename T, ActivationFunction activation, bool use_bias,
+          bool use_skip>
+/*
+Fused kernel, one work-item per (n, k): output-transforms four 6x6 tiles of
+`input` into an 8x8 board, optionally adds bias and skip, applies `activation`,
+optionally stores the board to `skip`, then input-transforms it into `output`.
+DPCT1110:7: declared local variables exceed 128 bytes and may cause high
+register pressure; a smaller sub-group size may help on some hardware.
+*/
+void OutputTransform_relu_InputTransform_kernel(
+    int N, int C, T* output, const T* input, T* skip, const T* bias,
+    const sycl::nd_item<3>& item_ct1) {
+  const bool fp16 = std::is_same<sycl::half, T>::value;
+
+  int k = item_ct1.get_local_id(2) +
+          item_ct1.get_group(2) * kOpInpTransformBlockSize;
+  if (k >= C) return;  // wasted threads (for non-multiple of 64 channel counts)
+  int n = item_ct1.get_group(1);
+
+  T board[8][8];
+  T b = bias[k];  // NOTE(review): read even when use_bias is false
+
+  T skipInp[8][8];  // NOTE(review): loaded even when use_skip is false
+#pragma unroll
+  for (int h = 0; h < 8; h++) {
+    copyAs<sycl::uint4>(&skipInp[h][0], &skip[INDEX_NHCW(n, k, h, 0)]);
+    if (!fp16)  // non-half T: a second copy starts at element 4 of the row
+        copyAs<sycl::uint4>(&skipInp[h][4], &skip[INDEX_NHCW(n, k, h, 4)]);
+  }
+
+#pragma unroll
+  for (int hStart = 0; hStart < 8; hStart += 4)
+#pragma unroll
+    for (int wStart = 0; wStart < 8; wStart += 4) {
+      // i) read one transformed 6x6 tile into a per-thread array
+      int shln = n * 4 + (hStart / 4) * 2 + (wStart / 4);
+      T outElTransformed[6][6];
+#pragma unroll
+      for (int y = 0; y < 6; y++)
+#pragma unroll
+        for (int x = 0; x < 6; x++)
+          outElTransformed[y][x] = input[TEMP_INDEX_HWNC(y, x, shln, k)];
+
+      // ii) transform it (6x6 -> 4x4) and copy into its quadrant of the board
+      T outEl[4][4];
+      OutputTransform4x4(&outEl[0][0], &outElTransformed[0][0]);
+
+#pragma unroll
+      for (int y = 0; y < 4; y++)
+#pragma unroll
+        for (int x = 0; x < 4; x++) board[hStart + y][wStart + x] = outEl[y][x];
+    }
+
+  // Add bias (no-op unless use_bias)
+#pragma unroll
+  for (int y = 0; y < 8; y++)
+#pragma unroll
+    for (int x = 0; x < 8; x++)
+      if (use_bias) board[y][x] += b;
+
+  // Add skip connection, apply the activation, and write the row to 'skip'.
+  for (int h = 0; h < 8; h++) {
+    // residual add
+    if (use_skip) {
+#pragma unroll
+      for (int w = 0; w < 8; w++) board[h][w] += skipInp[h][w];
+    }
+
+    // activation (computed in float, then converted back to T)
+    if (activation != ACTIVATION_NONE) {
+#pragma unroll
+      for (int w = 0; w < 8; w++)
+        board[h][w] = (T)activate((float)board[h][w], activation);
+    }
+
+    // write un-transformed output to 'skip' if required
+    if (use_skip) {
+      // Write to skip (use 128 bit writes to store one row at a time)
+      copyAs<sycl::uint4>(&skip[INDEX_NHCW(n, k, h, 0)], &board[h][0]);
+      if (!fp16)
+          copyAs<sycl::uint4>(&skip[INDEX_NHCW(n, k, h, 4)], &board[h][4]);
+    }
+  }
+
+  // perform input transform: four zero-padded 6x6 tiles cut from the board
+
+  int c = k;
+  // top-left: board rows 0-4, cols 0-4 -> tile offset (1, 1); rest stays zero
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j + 1] = board[i][j];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 0, c)] = inEl[y][x];
+  }
+
+  // top-right: board rows 0-4, cols 3-7 -> tile offset (1, 0); rest stays zero
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i + 1][j] = board[i][j + 3];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 1, c)] = inEl[y][x];
+  }
+
+  // bottom-left: board rows 3-7, cols 0-4 -> tile offset (0, 1); rest zero
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j + 1] = board[i + 3][j];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 2, c)] = inEl[y][x];
+  }
+
+  // bottom-right: board rows 3-7, cols 3-7 -> tile offset (0, 0); rest zero
+  {
+    T inEl[6][6] = {};
+
+#pragma unroll
+    for (int i = 0; i < 5; i++)
+#pragma unroll
+      for (int j = 0; j < 5; j++) inEl[i][j] = board[i + 3][j + 3];
+
+    InputTransform4x4(&inEl[0][0], &inEl[0][0]);
+
+#pragma unroll
+    for (int y = 0; y < 6; y++)
+#pragma unroll
+      for (int x = 0; x < 6; x++)
+        output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x];
+  }
+}
+
+template <typename T>
+void FilterTransform(int N, int C, T* transformedFilter, const T* filter, sycl::queue &mqueue) {
+  // Each work-item processes an entire filter block (input 3x3 elements ->
+  // output 6x6 elements); N * C work-items are needed, rounded up to groups.
+  const int work_group_size = 64;
+  const int num_work_groups = DivUp(N * C, work_group_size);
+  const sycl::range<3> local_size(1, 1, work_group_size);
+  const sycl::range<3> group_count(1, 1, num_work_groups);
+
+  mqueue.parallel_for(
+      sycl::nd_range<3>(group_count * local_size, local_size),
+      [=](sycl::nd_item<3> item_ct1) {
+        filterTransform_kernel(N, C, N * C, transformedFilter, filter,
+                               item_ct1);
+      });
+}
+
+template <typename T, bool nhcw>
+void InputTransform(int N, int C, T* transformed_input, const T* input,
+                    sycl::queue &mqueue) {
+  // Each work-item processes an entire chess board (input 8x8 elements ->
+  // outputs 2x2, 6x6 elements). The launch uses N work-groups of C
+  // work-items each.
+  /*
+  DPCT1049:8: The work-group size passed to the SYCL kernel may exceed the
+  limit. To get the device limit, query info::device::max_work_group_size.
+  Adjust the work-group size if needed.
+  */
+  const sycl::range<3> group_count(1, 1, N);
+  const sycl::range<3> local_size(1, 1, C);
+
+  mqueue.parallel_for(
+      sycl::nd_range<3>(group_count * local_size, local_size),
+      [=](sycl::nd_item<3> item_ct1) {
+        InputTransform_kernel<T, nhcw>(N, C, input, transformed_input,
+                                       item_ct1);
+      });
+}
+
+template <typename T, bool use_se, ActivationFunction activation, bool use_bias,
+          bool use_skip, bool skipInput_nhcw, bool output_nhcw>
+void OutputTransform(int N, int C, int se_K, T* output, const T* input,
+                     const T* skip, const T* bias, const T* w1, const T* b1,
+                     const T* w2, const T* b2, sycl::queue &mqueue) {
+  // Each work-item processes an entire chess board. The launch uses N
+  // work-groups of C work-items each.
+  /*
+  DPCT1049:9: The work-group size passed to the SYCL kernel may exceed the
+  limit. To get the device limit, query info::device::max_work_group_size.
+  Adjust the work-group size if needed.
+  */
+  const sycl::range<3> group_count(1, 1, N);
+  const sycl::range<3> local_size(1, 1, C);
+  mqueue.submit([&](sycl::handler& cgh) {
+    // 1024 floats of work-group local memory handed to the kernel.
+    sycl::local_accessor<float, 1> shared_data_acc_ct1(sycl::range<1>(1024),
+                                                       cgh);
+
+    cgh.parallel_for(
+        sycl::nd_range<3>(group_count * local_size, local_size),
+        [=](sycl::nd_item<3> item_ct1) {
+          OutputTransform_kernel<T, use_se, activation, use_bias, use_skip,
+                                 skipInput_nhcw, output_nhcw>(
+              N, C, se_K, output, input, skip, bias, w1, b1, w2, b2, item_ct1,
+              shared_data_acc_ct1.get_pointer());
+        });
+  });
+}
+
+}  // namespace sycldnn_backend
+}  // namespace lczero
diff --git a/src/neural/xla/network_xla.cc b/src/neural/backends/xla/network_xla.cc
similarity index 97%
rename from src/neural/xla/network_xla.cc
rename to src/neural/backends/xla/network_xla.cc
index 055105db88..336f03ef25 100644
--- a/src/neural/xla/network_xla.cc
+++ b/src/neural/backends/xla/network_xla.cc
@@ -27,11 +27,11 @@
 
 #include <cassert>
 
+#include "neural/backends/xla/xla_runner.h"
 #include "neural/factory.h"
 #include "neural/network.h"
 #include "neural/onnx/converter.h"
 #include "neural/xla/onnx2hlo.h"
-#include "neural/xla/xla_runner.h"
 #include "utils/bititer.h"
 
 namespace lczero {
@@ -81,10 +81,10 @@ class XlaNetwork : public Network {
     return std::make_unique<XlaComputation>(this);
   }
   int GetMiniBatchSize() const override {
-    // 32 is the default prefetch size, subtract it so that backend doesn't
-    // crash.
-    // TODO make it better when we have a proper way to query the batch size.
-    return runner_->GetMaxBatchSize() - 32;
+    return runner_->GetMaxBatchSize();
+  }
+  int GetPreferredBatchStep() const override {
+    return runner_->GetPreferredBatchStep();
   }
 
  private:
@@ -303,6 +303,8 @@ std::unique_ptr<Network> MakeXlaNetwork(const std::optional<WeightsFile>& w,
         WeightsToOnnxConverterOptions::StringToDataType(
             opts.GetOrDefault<std::string>("datatype", "f32"));
     onnx_converter_options.opset = 22;  // For full onnx bfloat16 support.
+    onnx_converter_options.alt_mish =
+        opts.GetOrDefault<bool>("alt_mish", false);
     auto converted = ConvertWeightsToOnnx(*w, onnx_converter_options);
     options = FillXlaRunnerFromOnnx(converted.onnx_model(), runner.get(),
                                     max_batch_size, steps, io_type);
diff --git a/src/neural/xla/pjrt.cc b/src/neural/backends/xla/pjrt.cc
similarity index 100%
rename from src/neural/xla/pjrt.cc
rename to src/neural/backends/xla/pjrt.cc
diff --git a/src/neural/xla/pjrt.h b/src/neural/backends/xla/pjrt.h
similarity index 100%
rename from src/neural/xla/pjrt.h
rename to src/neural/backends/xla/pjrt.h
diff --git a/src/neural/xla/xla_runner.cc b/src/neural/backends/xla/xla_runner.cc
similarity index 93%
rename from src/neural/xla/xla_runner.cc
rename to src/neural/backends/xla/xla_runner.cc
index eaef957535..0adac0dbd0 100644
--- a/src/neural/xla/xla_runner.cc
+++ b/src/neural/backends/xla/xla_runner.cc
@@ -25,7 +25,7 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "neural/xla/xla_runner.h"
+#include "neural/backends/xla/xla_runner.h"
 
 #include <algorithm>
 #include <numeric>
@@ -124,7 +124,16 @@ void XlaRunner::AddModule(size_t minibatch_size,
   pblczero::CompileOptionsProto options;
   options.mutable_executable_build_options()->set_num_replicas(1);
   options.mutable_executable_build_options()->set_num_partitions(1);
-  options.mutable_executable_build_options()->set_device_ordinal(device_);
+  options.mutable_executable_build_options()
+      ->mutable_device_assignment()
+      ->set_replica_count(1);
+  options.mutable_executable_build_options()
+      ->mutable_device_assignment()
+      ->set_computation_count(1);
+  options.mutable_executable_build_options()
+      ->mutable_device_assignment()
+      ->add_computation_devices()
+      ->add_replica_device_ids(device_);
   auto executable = pjrt_client_->CompileHlo(module.OutputAsString(),
                                              options.OutputAsString());
   executables_.push_back({minibatch_size, std::move(executable)});
@@ -161,6 +170,7 @@ void XlaRunner::SetFrozenInputs(
 }
 
 size_t XlaRunner::GetMaxBatchSize() const { return executables_.back().first; }
+size_t XlaRunner::GetPreferredBatchStep() const { return executables_.front().first; }
 
 std::vector<std::unique_ptr<XlaMutableTensor>> XlaRunner::ExecuteBlocking(
     const std::vector<XlaMutableTensor*>& inputs) {
@@ -189,7 +199,7 @@ std::vector<std::unique_ptr<XlaMutableTensor>> XlaRunner::ExecuteBlocking(
           ->HostToDevice(
               {static_cast<const char*>(inputs[0]->data()), inputs[0]->size()},
               XlaTypeToPjrtType(inputs[0]->type()), new_shape,
-              devices_[0].get())
+              devices_.at(device_).get())
           ->AwaitAndReleaseBuffer();
   // Make a copy to support multiple concurrent calls, not sure if it's needed.
   auto input_buffers = buffers_;
diff --git a/src/neural/xla/xla_runner.h b/src/neural/backends/xla/xla_runner.h
similarity index 96%
rename from src/neural/xla/xla_runner.h
rename to src/neural/backends/xla/xla_runner.h
index 62c4a27f14..4c8f71c374 100644
--- a/src/neural/xla/xla_runner.h
+++ b/src/neural/backends/xla/xla_runner.h
@@ -33,9 +33,9 @@
 #include <unordered_map>
 #include <vector>
 
-#include "neural/xla/hlo.pb.h"
-#include "neural/xla/pjrt.h"
+#include "neural/backends/xla/pjrt.h"
 #include "neural/xla/xla_tensor.h"
+#include "proto/hlo.pb.h"
 
 namespace lczero {
 
@@ -60,6 +60,7 @@ class XlaRunner {
   // Maximum supported batch size. It's expected that the capacity (not size) of
   // the input tensors would be able to fit this size.
   size_t GetMaxBatchSize() const;
+  size_t GetPreferredBatchStep() const;
 
  private:
   std::unique_ptr<PjrtClient> pjrt_client_;
@@ -76,4 +77,4 @@ class XlaRunner {
   int device_;
 };
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/batchsplit.cc b/src/neural/batchsplit.cc
new file mode 100644
index 0000000000..00cee0b628
--- /dev/null
+++ b/src/neural/batchsplit.cc
@@ -0,0 +1,101 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/batchsplit.h"
+
+namespace lczero {
+namespace {
+
+class BatchSplittingBackend : public Backend {
+ public:
+  // Delegates every call to |wrapped| (not owned); only CreateComputation()
+  // differs, returning a computation that splits oversized batches.
+  explicit BatchSplittingBackend(Backend* wrapped)
+      : wrapped_backend_(wrapped) {}
+  BackendAttributes GetAttributes() const override {
+    return wrapped_backend_->GetAttributes();
+  }
+  std::optional<EvalResult> GetCachedEvaluation(
+      const EvalPosition& pos) override {
+    return wrapped_backend_->GetCachedEvaluation(pos);
+  }
+  std::unique_ptr<BackendComputation> CreateComputation() override;
+  UpdateConfigurationResult UpdateConfiguration(
+      const OptionsDict& options) override {
+    return wrapped_backend_->UpdateConfiguration(options);
+  }
+  bool IsSameConfiguration(const OptionsDict& options) const override {
+    return wrapped_backend_->IsSameConfiguration(options);
+  }
+
+ private:
+  Backend* wrapped_backend_;  // Not owned; must outlive this object.
+};
+
+class BatchSplittingComputation : public BackendComputation {
+ public:
+  // Forwards inputs to the wrapped backend in sub-batches of bounded size.
+  explicit BatchSplittingComputation(Backend* wrapped_backend)
+      : wrapped_backend_(wrapped_backend),
+        max_batch_size_(wrapped_backend->GetAttributes().maximum_batch_size) {
+    MakeComputation();
+  }
+  size_t UsedBatchSize() const override {
+    return wrapped_computation_->UsedBatchSize();  // current sub-batch only
+  }
+  AddInputResult AddInput(const EvalPosition& pos,
+                          EvalResultPtr result) override {
+    if (wrapped_computation_->UsedBatchSize() >= max_batch_size_) {
+      ComputeBlocking();  // Evaluate the full sub-batch right away...
+      MakeComputation();  // ...and continue in a fresh one.
+    }
+    return wrapped_computation_->AddInput(pos, result);
+  }
+  // Evaluates whatever is still pending in the current sub-batch.
+  void ComputeBlocking() override { wrapped_computation_->ComputeBlocking(); }
+
+ private:
+  void MakeComputation() {
+    wrapped_computation_ = wrapped_backend_->CreateComputation();
+  }
+
+  Backend* wrapped_backend_;  // Not owned.
+  size_t max_batch_size_;
+  std::unique_ptr<BackendComputation> wrapped_computation_;
+};
+// Out-of-line: needs the complete BatchSplittingComputation type.
+std::unique_ptr<BackendComputation> BatchSplittingBackend::CreateComputation() {
+  return std::make_unique<BatchSplittingComputation>(wrapped_backend_);
+}
+
+}  // namespace
+
+std::unique_ptr<Backend> CreateBatchSplitingBackend(Backend* parent) {
+  return std::make_unique<BatchSplittingBackend>(parent);  // parent: not owned
+}
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/neural/batchsplit.h b/src/neural/batchsplit.h
new file mode 100644
index 0000000000..3766fe5886
--- /dev/null
+++ b/src/neural/batchsplit.h
@@ -0,0 +1,38 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "neural/backend.h"
+
+namespace lczero {
+
+// Wraps |parent| (not owned; must outlive the result) so that no computation
+// sent to it exceeds its maximum batch size; larger batches are split.
+std::unique_ptr<Backend> CreateBatchSplitingBackend(Backend* parent);
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/neural/cache.cc b/src/neural/cache.cc
deleted file mode 100644
index d729a562f0..0000000000
--- a/src/neural/cache.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2018 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-#include "neural/cache.h"
-#include <cassert>
-#include <iostream>
-
-namespace lczero {
-CachingComputation::CachingComputation(
-    std::unique_ptr<NetworkComputation> parent, NNCache* cache)
-    : parent_(std::move(parent)), cache_(cache) {}
-
-int CachingComputation::GetCacheMisses() const {
-  return parent_->GetBatchSize();
-}
-
-int CachingComputation::GetBatchSize() const { return batch_.size(); }
-
-bool CachingComputation::AddInputByHash(uint64_t hash) {
-  NNCacheLock lock(cache_, hash);
-  if (!lock) return false;
-  AddInputByHash(hash, std::move(lock));
-  return true;
-}
-
-void CachingComputation::AddInputByHash(uint64_t hash, NNCacheLock&& lock) {
-  assert(lock);
-  batch_.emplace_back();
-  batch_.back().lock = std::move(lock);
-  batch_.back().hash = hash;
-}
-
-void CachingComputation::PopCacheHit() {
-  assert(!batch_.empty());
-  assert(batch_.back().lock);
-  assert(batch_.back().idx_in_parent == -1);
-  batch_.pop_back();
-}
-
-void CachingComputation::AddInput(
-    uint64_t hash, InputPlanes&& input,
-    std::vector<uint16_t>&& probabilities_to_cache) {
-  if (AddInputByHash(hash)) return;
-  batch_.emplace_back();
-  batch_.back().hash = hash;
-  batch_.back().idx_in_parent = parent_->GetBatchSize();
-  batch_.back().probabilities_to_cache = probabilities_to_cache;
-  parent_->AddInput(std::move(input));
-}
-
-void CachingComputation::PopLastInputHit() {
-  assert(!batch_.empty());
-  assert(batch_.back().idx_in_parent == -1);
-  batch_.pop_back();
-}
-
-void CachingComputation::ComputeBlocking() {
-  if (parent_->GetBatchSize() == 0) return;
-  parent_->ComputeBlocking();
-
-  // Fill cache with data from NN.
-  for (const auto& item : batch_) {
-    if (item.idx_in_parent == -1) continue;
-    auto req =
-        std::make_unique<CachedNNRequest>(item.probabilities_to_cache.size());
-    req->q = parent_->GetQVal(item.idx_in_parent);
-    req->d = parent_->GetDVal(item.idx_in_parent);
-    req->m = parent_->GetMVal(item.idx_in_parent);
-    int idx = 0;
-    for (auto x : item.probabilities_to_cache) {
-      req->p[idx++] =
-          std::make_pair(x, parent_->GetPVal(item.idx_in_parent, x));
-    }
-    cache_->Insert(item.hash, std::move(req));
-  }
-}
-
-float CachingComputation::GetQVal(int sample) const {
-  const auto& item = batch_[sample];
-  if (item.idx_in_parent >= 0) return parent_->GetQVal(item.idx_in_parent);
-  return item.lock->q;
-}
-
-float CachingComputation::GetDVal(int sample) const {
-  const auto& item = batch_[sample];
-  if (item.idx_in_parent >= 0) return parent_->GetDVal(item.idx_in_parent);
-  return item.lock->d;
-}
-
-float CachingComputation::GetMVal(int sample) const {
-  const auto& item = batch_[sample];
-  if (item.idx_in_parent >= 0) return parent_->GetMVal(item.idx_in_parent);
-  return item.lock->m;
-}
-
-float CachingComputation::GetPVal(int sample, int move_id) const {
-  auto& item = batch_[sample];
-  if (item.idx_in_parent >= 0)
-    return parent_->GetPVal(item.idx_in_parent, move_id);
-  const auto& moves = item.lock->p;
-
-  int total_count = 0;
-  while (total_count < moves.size()) {
-    // Optimization: usually moves are stored in the same order as queried.
-    const auto& move = moves[item.last_idx++];
-    if (item.last_idx == moves.size()) item.last_idx = 0;
-    if (move.first == move_id) return move.second;
-    ++total_count;
-  }
-  assert(false);  // Move not found.
-  return 0;
-}
-
-}  // namespace lczero
diff --git a/src/neural/cache.h b/src/neural/cache.h
deleted file mode 100644
index 207e0fe6e4..0000000000
--- a/src/neural/cache.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2018 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-#pragma once
-
-#include "neural/network.h"
-#include "utils/cache.h"
-#include "utils/smallarray.h"
-
-namespace lczero {
-
-struct CachedNNRequest {
-  CachedNNRequest(size_t size) : p(size) {}
-  typedef std::pair<uint16_t, float> IdxAndProb;
-  float q;
-  float d;
-  float m;
-  // TODO(mooskagh) Don't really need index if using perfect hash.
-  SmallArray<IdxAndProb> p;
-};
-
-typedef HashKeyedCache<CachedNNRequest> NNCache;
-typedef HashKeyedCacheLock<CachedNNRequest> NNCacheLock;
-
-// Wraps around NetworkComputation and caches result.
-// While it mostly repeats NetworkComputation interface, it's not derived
-// from it, as AddInput() needs hash and index of probabilities to store.
-class CachingComputation {
- public:
-  CachingComputation(std::unique_ptr<NetworkComputation> parent,
-                     NNCache* cache);
-
-  // How many inputs are not found in cache and will be forwarded to a wrapped
-  // computation.
-  int GetCacheMisses() const;
-  // Total number of times AddInput/AddInputByHash were (successfully) called.
-  int GetBatchSize() const;
-  // Adds input by hash only. If that hash is not in cache, returns false
-  // and does nothing. Otherwise adds.
-  bool AddInputByHash(uint64_t hash);
-  // Adds input by hash with existing lock. Assumes the given lock holds a real
-  // reference.
-  void AddInputByHash(uint64_t hash, NNCacheLock&& lock);
-  // Adds a sample to the batch.
-  // @hash is a hash to store/lookup it in the cache.
-  // @probabilities_to_cache is which indices of policy head to store.
-  void AddInput(uint64_t hash, InputPlanes&& input,
-                std::vector<uint16_t>&& probabilities_to_cache);
-  // Undos last AddInput. If it was a cache miss, the it's actually not removed
-  // from parent's batch.
-  void PopLastInputHit();
-  // Do the computation.
-  void ComputeBlocking();
-  // Returns Q value of @sample.
-  float GetQVal(int sample) const;
-  // Returns probability of draw if NN has WDL value head.
-  float GetDVal(int sample) const;
-  // Returns estimated remaining moves.
-  float GetMVal(int sample) const;
-  // Returns P value @move_id of @sample.
-  float GetPVal(int sample, int move_id) const;
-  // Pops last input from the computation. Only allowed for inputs which were
-  // cached.
-  void PopCacheHit();
-
-  // Can be used to avoid repeated reallocations internally while adding itemms.
-  void Reserve(int batch_size) { batch_.reserve(batch_size); }
-
- private:
-  struct WorkItem {
-    uint64_t hash;
-    NNCacheLock lock;
-    int idx_in_parent = -1;
-    std::vector<uint16_t> probabilities_to_cache;
-    mutable int last_idx = 0;
-  };
-
-  std::unique_ptr<NetworkComputation> parent_;
-  NNCache* cache_;
-  std::vector<WorkItem> batch_;
-};
-
-}  // namespace lczero
diff --git a/src/neural/cuda/inputs_outputs.h b/src/neural/cuda/inputs_outputs.h
deleted file mode 100644
index 4c356994a8..0000000000
--- a/src/neural/cuda/inputs_outputs.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2018 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-
-#pragma once
-
-#include "neural/network.h"
-
-namespace lczero {
-namespace cudnn_backend {
-
-struct InputsOutputs {
-  InputsOutputs(int maxBatchSize, bool wdl, bool moves_left,
-                size_t tensor_mem_size = 0, size_t scratch_size = 0,
-                bool cublasDisableTensorCores = false) {
-    ReportCUDAErrors(cudaHostAlloc(
-        &input_masks_mem_, maxBatchSize * kInputPlanes * sizeof(uint64_t),
-        cudaHostAllocMapped));
-    ReportCUDAErrors(
-        cudaHostGetDevicePointer(&input_masks_mem_gpu_, input_masks_mem_, 0));
-
-    ReportCUDAErrors(cudaHostAlloc(&input_val_mem_,
-                                   maxBatchSize * kInputPlanes * sizeof(float),
-                                   cudaHostAllocMapped));
-    ReportCUDAErrors(
-        cudaHostGetDevicePointer(&input_val_mem_gpu_, input_val_mem_, 0));
-
-    ReportCUDAErrors(cudaHostAlloc(
-        &op_policy_mem_, maxBatchSize * kNumOutputPolicy * sizeof(float), 0));
-
-    // Seperate device memory copy for policy output.
-    // It's faster to write to device memory and then copy to host memory
-    // than having the kernel write directly to it.
-    ReportCUDAErrors(cudaMalloc(
-        &op_policy_mem_gpu_, maxBatchSize * kNumOutputPolicy * sizeof(float)));
-
-    ReportCUDAErrors(cudaHostAlloc(&op_value_mem_,
-                                   maxBatchSize * (wdl ? 3 : 1) * sizeof(float),
-                                   cudaHostAllocMapped));
-    ReportCUDAErrors(
-        cudaHostGetDevicePointer(&op_value_mem_gpu_, op_value_mem_, 0));
-    if (moves_left) {
-      ReportCUDAErrors(cudaHostAlloc(&op_moves_left_mem_,
-                                     maxBatchSize * sizeof(float),
-                                     cudaHostAllocMapped));
-      ReportCUDAErrors(cudaHostGetDevicePointer(&op_moves_left_mem_gpu_,
-                                                op_moves_left_mem_, 0));
-    }
-
-    // memory for network execution managed inside this structure
-    if (tensor_mem_size) {
-      multi_stream_ = true;
-      ReportCUDAErrors(cudaStreamCreate(&stream_));
-      ReportCUDAErrors(cudaMalloc(&scratch_mem_, scratch_size));
-      for (auto& mem : tensor_mem_) {
-        ReportCUDAErrors(cudaMalloc(&mem, tensor_mem_size));
-        ReportCUDAErrors(cudaMemsetAsync(mem, 0, tensor_mem_size, stream_));
-      }
-      ReportCUBLASErrors(cublasCreate(&cublas_));
-      ReportCUBLASErrors(cublasSetMathMode(
-          cublas_, cublasDisableTensorCores ? CUBLAS_PEDANTIC_MATH
-                                            : CUBLAS_TENSOR_OP_MATH));
-      ReportCUBLASErrors(cublasSetStream(cublas_, stream_));
-    } else {
-      multi_stream_ = false;
-    }
-  }
-  ~InputsOutputs() {
-    ReportCUDAErrors(cudaFreeHost(input_masks_mem_));
-    ReportCUDAErrors(cudaFreeHost(input_val_mem_));
-    ReportCUDAErrors(cudaFreeHost(op_policy_mem_));
-    ReportCUDAErrors(cudaFree(op_policy_mem_gpu_));
-    ReportCUDAErrors(cudaFreeHost(op_value_mem_));
-    if (op_moves_left_mem_ != nullptr)
-      ReportCUDAErrors(cudaFreeHost(op_moves_left_mem_));
-
-    if (multi_stream_) {
-      for (auto mem : tensor_mem_) {
-        if (mem) ReportCUDAErrors(cudaFree(mem));
-      }
-      if (scratch_mem_) ReportCUDAErrors(cudaFree(scratch_mem_));
-      if (offset_pointers_) ReportCUDAErrors(cudaFree(offset_pointers_));
-      if (head_offset_pointers_) {
-        ReportCUDAErrors(cudaFree(head_offset_pointers_));
-      }
-      cudaStreamDestroy(stream_);
-      cublasDestroy(cublas_);
-    }
-  }
-  uint64_t* input_masks_mem_;
-  float* input_val_mem_;
-  float* op_policy_mem_;
-  float* op_value_mem_;
-  float* op_moves_left_mem_ = nullptr;
-
-  // GPU pointers for the above allocations.
-  uint64_t* input_masks_mem_gpu_;
-  float* input_val_mem_gpu_;
-  float* op_value_mem_gpu_;
-  float* op_moves_left_mem_gpu_;
-
-  // This is a seperate copy.
-  float* op_policy_mem_gpu_;
-
-  // memory needed to run the network owned by InputsOutputs when multi_stream
-  // is enabled
-  bool multi_stream_;
-  void* tensor_mem_[3];
-  void* scratch_mem_;
-  void** offset_pointers_ = nullptr;
-  void** head_offset_pointers_ = nullptr;
-
-  // cuda stream used to run the network
-  cudaStream_t stream_;
-
-  // cublas handle used to run the network
-  cublasHandle_t cublas_;
-};
-
-}  // namespace cudnn_backend
-}  // namespace lczero
diff --git a/src/neural/decoder.cc b/src/neural/decoder.cc
index 34f78466bf..9f2be0472a 100644
--- a/src/neural/decoder.cc
+++ b/src/neural/decoder.cc
@@ -33,12 +33,12 @@ namespace lczero {
 
 namespace {
 
-BoardSquare SingleSquare(BitBoard input) {
+Square SingleSquare(BitBoard input) {
   for (auto sq : input) {
     return sq;
   }
   assert(false);
-  return BoardSquare();
+  return Square();
 }
 
 BitBoard MaskDiffWithMirror(const InputPlane& cur, const InputPlane& prev) {
@@ -47,7 +47,7 @@ BitBoard MaskDiffWithMirror(const InputPlane& cur, const InputPlane& prev) {
   return BitBoard(cur.mask ^ to_mirror.as_int());
 }
 
-BoardSquare OldPosition(const InputPlane& prev, BitBoard mask_diff) {
+Square OldPosition(const InputPlane& prev, BitBoard mask_diff) {
   auto to_mirror = BitBoard(prev.mask);
   to_mirror.Mirror();
   return SingleSquare(to_mirror & mask_diff);
@@ -95,34 +95,36 @@ void PopulateBoard(pblczero::NetworkFormat::InputFormat input_format,
     case pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION_V2:
     case pblczero::NetworkFormat::
         INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON: {
-      int our_queenside = ChessBoard::FILE_A;
-      int their_queenside = ChessBoard::FILE_A;
-      int our_kingside = ChessBoard::FILE_H;
-      int their_kingside = ChessBoard::FILE_H;
+      File our_queenside = kFileA;
+      File their_queenside = kFileA;
+      File our_kingside = kFileH;
+      File their_kingside = kFileH;
       if (planes[kAuxPlaneBase + 0].mask != 0) {
         auto mask = planes[kAuxPlaneBase + 0].mask;
         if ((mask & 0xFFLL) != 0) {
-          our_queenside = GetLowestBit(mask & 0xFFLL);
+          our_queenside = File::FromIdx(GetLowestBit(mask & 0xFFLL));
           castlings.set_we_can_000();
         }
         if (mask >> 56 != 0) {
-          their_queenside = GetLowestBit(mask >> 56);
+          their_queenside = File::FromIdx(GetLowestBit(mask >> 56));
           castlings.set_they_can_000();
         }
       }
       if (planes[kAuxPlaneBase + 1].mask != 0) {
         auto mask = planes[kAuxPlaneBase + 1].mask;
         if ((mask & 0xFFLL) != 0) {
-          our_kingside = GetLowestBit(mask & 0xFFLL);
+          our_kingside = File::FromIdx(GetLowestBit(mask & 0xFFLL));
           castlings.set_we_can_00();
         }
         if (mask >> 56 != 0) {
-          their_kingside = GetLowestBit(mask >> 56);
+          their_kingside = File::FromIdx(GetLowestBit(mask >> 56));
           castlings.set_they_can_00();
         }
       }
-      castlings.SetRookPositions(our_queenside, our_kingside, their_queenside,
-                                 their_kingside);
+      castlings.our_kingside_rook = our_kingside;
+      castlings.our_queenside_rook = our_queenside;
+      castlings.their_kingside_rook = their_kingside;
+      castlings.their_queenside_rook = their_queenside;
       break;
     }
 
@@ -161,29 +163,30 @@ void PopulateBoard(pblczero::NetworkFormat::InputFormat input_format,
     int emptycounter = 0;
     for (int col = 0; col < 8; ++col) {
       char piece = '\0';
-      if (pawnsOurs.get(row, col)) {
+      Square square(File::FromIdx(col), Rank::FromIdx(row));
+      if (pawnsOurs.get(square)) {
         piece = 'P';
-      } else if (pawnsTheirs.get(row, col)) {
+      } else if (pawnsTheirs.get(square)) {
         piece = 'p';
-      } else if (knightsOurs.get(row, col)) {
+      } else if (knightsOurs.get(square)) {
         piece = 'N';
-      } else if (knightsTheirs.get(row, col)) {
+      } else if (knightsTheirs.get(square)) {
         piece = 'n';
-      } else if (bishopOurs.get(row, col)) {
+      } else if (bishopOurs.get(square)) {
         piece = 'B';
-      } else if (bishopTheirs.get(row, col)) {
+      } else if (bishopTheirs.get(square)) {
         piece = 'b';
-      } else if (rookOurs.get(row, col)) {
+      } else if (rookOurs.get(square)) {
         piece = 'R';
-      } else if (rookTheirs.get(row, col)) {
+      } else if (rookTheirs.get(square)) {
         piece = 'r';
-      } else if (queenOurs.get(row, col)) {
+      } else if (queenOurs.get(square)) {
         piece = 'Q';
-      } else if (queenTheirs.get(row, col)) {
+      } else if (queenTheirs.get(square)) {
         piece = 'q';
-      } else if (kingOurs.get(row, col)) {
+      } else if (kingOurs.get(square)) {
         piece = 'K';
-      } else if (kingTheirs.get(row, col)) {
+      } else if (kingTheirs.get(square)) {
         piece = 'k';
       }
       if (emptycounter > 0 && piece) {
@@ -209,8 +212,9 @@ void PopulateBoard(pblczero::NetworkFormat::InputFormat input_format,
     if (planes[kAuxPlaneBase + 4].mask == 0) {
       fen += "-";
     } else {
-      int col = GetLowestBit(planes[kAuxPlaneBase + 4].mask >> 56);
-      fen += BoardSquare(5, col).as_string();
+      File file =
+          File::FromIdx(GetLowestBit(planes[kAuxPlaneBase + 4].mask >> 56));
+      fen += Square(file, kRank6).ToString();
     }
   } else {
     auto pawndiff = BitBoard(planes[6].mask ^ planes[kPlanesPerBoard + 6].mask);
@@ -220,15 +224,15 @@ void PopulateBoard(pblczero::NetworkFormat::InputFormat input_format,
       auto from =
           SingleSquare(planes[kPlanesPerBoard + 6].mask & pawndiff.as_int());
       auto to = SingleSquare(planes[6].mask & pawndiff.as_int());
-      if (from.col() != to.col() || std::abs(from.row() - to.row()) != 2) {
+      if (from.file() != to.file() || std::abs(from.rank() - to.rank()) != 2) {
         fen += "-";
       } else {
         // TODO: Ensure enpassant is legal rather than setting it blindly?
         // Doesn't matter for rescoring use case as only legal moves will be
         // performed afterwards.
-        fen +=
-            BoardSquare((planes[kAuxPlaneBase + 4].mask != 0) ? 2 : 5, to.col())
-                .as_string();
+        fen += Square(to.file(),
+                      (planes[kAuxPlaneBase + 4].mask != 0) ? kRank3 : kRank6)
+                   .ToString();
       }
     } else {
       fen += "-";
@@ -257,19 +261,19 @@ Move DecodeMoveFromInput(const InputPlanes& planes, const InputPlanes& prior) {
     auto from = SingleSquare(pawndiff);
     if (knightdiff.count() == 1) {
       auto to = SingleSquare(knightdiff);
-      return Move(from, to, Move::Promotion::Knight);
+      return Move::WhitePromotion(from, to, kKnight);
     }
     if (bishopdiff.count() == 1) {
       auto to = SingleSquare(bishopdiff);
-      return Move(from, to, Move::Promotion::Bishop);
+      return Move::WhitePromotion(from, to, kBishop);
     }
     if (rookdiff.count() == 1) {
       auto to = SingleSquare(rookdiff);
-      return Move(from, to, Move::Promotion::Rook);
+      return Move::WhitePromotion(from, to, kRook);
     }
     if (queendiff.count() == 1) {
       auto to = SingleSquare(queendiff);
-      return Move(from, to, Move::Promotion::Queen);
+      return Move::WhitePromotion(from, to, kQueen);
     }
     assert(false);
     return Move();
@@ -280,64 +284,79 @@ Move DecodeMoveFromInput(const InputPlanes& planes, const InputPlanes& prior) {
     if (rookdiff.count() == 2) {
       auto from = OldPosition(prior[5], kingdiff);
       auto to = OldPosition(prior[3], rookdiff);
-      return Move(from, to);
+      Move m = Move::WhiteCastling(from.file(), to.file());
+      if (from.rank() == kRank8) m.Flip();
+      return m;
     }
     auto from = OldPosition(prior[5], kingdiff);
     auto to = SingleSquare(planes[11].mask & kingdiff.as_int());
-    if (std::abs(from.col() - to.col()) > 1) {
+    if (std::abs(from.file() - to.file()) > 1) {
       // Chess 960 castling can leave the rook in place, but the king has moved
       // from one side of the rook to the other - thus has gone at least 2
       // squares, which is impossible for a normal king move. Can't work out the
       // rook location from rookdiff since its empty, but it is known given the
       // direction of the king movement and the knowledge that the rook hasn't
       // moved.
-      if (from.col() > to.col()) {
-        to = BoardSquare(from.row(), to.col() + 1);
+      if (from.file() > to.file()) {
+        to = Square(to.file() + 1, from.rank());
       } else {
-        to = BoardSquare(from.row(), to.col() - 1);
+        to = Square(to.file() - 1, from.rank());
       }
+      Move m = Move::WhiteCastling(from.file(), to.file());
+      if (from.rank() == kRank8) m.Flip();
+      return m;
     }
-    return Move(from, to);
+    return Move::White(from, to);
   }
   if (queendiff.count() == 2) {
     auto from = OldPosition(prior[4], queendiff);
     auto to = SingleSquare(planes[10].mask & queendiff.as_int());
-    return Move(from, to);
+    return Move::White(from, to);
   }
   if (rookdiff.count() == 2) {
     auto from = OldPosition(prior[3], rookdiff);
     auto to = SingleSquare(planes[9].mask & rookdiff.as_int());
     // Only one king, so we can simply grab its current location directly.
     auto kingpos = SingleSquare(planes[11].mask);
-    if (from.row() == kingpos.row() && to.row() == kingpos.row() &&
-        ((from.col() < kingpos.col() && to.col() > kingpos.col()) ||
-         (from.col() > kingpos.col() && to.col() < kingpos.col()))) {
+    if (from.rank() == kingpos.rank() && to.rank() == kingpos.rank() &&
+        ((from.file() < kingpos.file() && to.file() > kingpos.file()) ||
+         (from.file() > kingpos.file() && to.file() < kingpos.file()))) {
       // If the king hasn't moved, this could still be a chess 960 castling move
       // if the rook has passed through the king.
       // Destination of the castling move is where the rook started.
       to = from;
       // And since the king didn't move it forms the start position.
       from = kingpos;
+      Move m = Move::WhiteCastling(from.file(), to.file());
+      if (from.rank() == kRank8) m.Flip();
+      return m;
     }
-    return Move(from, to);
+    return Move::White(from, to);
   }
   if (bishopdiff.count() == 2) {
     auto from = OldPosition(prior[2], bishopdiff);
     auto to = SingleSquare(planes[8].mask & bishopdiff.as_int());
-    return Move(from, to);
+    return Move::White(from, to);
   }
   if (knightdiff.count() == 2) {
     auto from = OldPosition(prior[1], knightdiff);
     auto to = SingleSquare(planes[7].mask & knightdiff.as_int());
-    return Move(from, to);
+    return Move::White(from, to);
   }
   if (pawndiff.count() == 2) {
     auto from = OldPosition(prior[0], pawndiff);
     auto to = SingleSquare(planes[6].mask & pawndiff.as_int());
-    return Move(from, to);
+    // Check for enpassant.
+    auto targets = BitBoard(prior[6].mask | prior[7].mask | prior[8].mask |
+                            prior[9].mask | prior[10].mask);
+    targets.Mirror();
+    if (from.file() != to.file() && (targets & pawndiff) == 0) {
+      return Move::WhiteEnPassant(from, to);
+    }
+    return Move::White(from, to);
   }
   assert(false);
-  return Move();
+  throw Exception("Invalid move encoding");
 }
 
 }  // namespace lczero
diff --git a/src/neural/encoder.cc b/src/neural/encoder.cc
index b459aec75b..fe412a9ffa 100644
--- a/src/neural/encoder.cc
+++ b/src/neural/encoder.cc
@@ -133,7 +133,7 @@ int TransformForPosition(pblczero::NetworkFormat::InputFormat input_format,
 
 InputPlanes EncodePositionForNN(
     pblczero::NetworkFormat::InputFormat input_format,
-    const PositionHistory& history, int history_planes,
+    std::span<const Position> history, int history_planes,
     FillEmptyHistory fill_empty_history, int* transform_out) {
   InputPlanes result(kAuxPlaneBase + 8);
 
@@ -146,7 +146,7 @@ InputPlanes EncodePositionForNN(
   // it for the first board.
   ChessBoard::Castlings castlings;
   {
-    const ChessBoard& board = history.Last().GetBoard();
+    const ChessBoard& board = history.back().GetBoard();
     const bool we_are_black = board.flipped();
     if (IsCanonicalFormat(input_format)) {
       transform = ChooseTransform(board);
@@ -183,22 +183,23 @@ InputPlanes EncodePositionForNN(
         const auto& cast = board.castlings();
         result[kAuxPlaneBase + 0].mask =
             (cast.we_can_000()
-                 ? BoardSquare(ChessBoard::A1 + cast.our_queenside_rook())
-                       .as_board()
+                 ? BitBoard::FromSquare(Square(cast.our_queenside_rook, kRank1))
+                       .as_int()
                  : 0) |
             (cast.they_can_000()
-                 ? BoardSquare(ChessBoard::A8 + cast.their_queenside_rook())
-                       .as_board()
+                 ? BitBoard::FromSquare(
+                       Square(cast.their_queenside_rook, kRank8))
+                       .as_int()
                  : 0);
         result[kAuxPlaneBase + 1].mask =
             (cast.we_can_00()
-                 ? BoardSquare(ChessBoard::A1 + cast.our_kingside_rook())
-                       .as_board()
+                 ? BitBoard::FromSquare(Square(cast.our_kingside_rook, kRank1))
+                       .as_int()
                  : 0) |
-            (cast.they_can_00()
-                 ? BoardSquare(ChessBoard::A8 + cast.their_kingside_rook())
-                       .as_board()
-                 : 0);
+            (cast.they_can_00() ? BitBoard::FromSquare(
+                                      Square(cast.their_kingside_rook, kRank8))
+                                      .as_int()
+                                : 0);
         break;
       }
       default:
@@ -211,9 +212,9 @@ InputPlanes EncodePositionForNN(
       if (we_are_black) result[kAuxPlaneBase + 4].SetAll();
     }
     if (IsHectopliesFormat(input_format)) {
-      result[kAuxPlaneBase + 5].Fill(history.Last().GetRule50Ply() / 100.0f);
+      result[kAuxPlaneBase + 5].Fill(history.back().GetRule50Ply() / 100.0f);
     } else {
-      result[kAuxPlaneBase + 5].Fill(history.Last().GetRule50Ply());
+      result[kAuxPlaneBase + 5].Fill(history.back().GetRule50Ply());
     }
     // Plane kAuxPlaneBase + 6 used to be movecount plane, now it's all zeros
     // unless we need it for canonical armageddon side to move.
@@ -232,18 +233,17 @@ InputPlanes EncodePositionForNN(
       input_format == pblczero::NetworkFormat::
                           INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON;
   bool flip = false;
-  int history_idx = history.GetLength() - 1;
+  int history_idx = history.size() - 1;
   for (int i = 0; i < std::min(history_planes, kMoveHistory);
        ++i, --history_idx) {
-    const Position& position =
-        history.GetPositionAt(history_idx < 0 ? 0 : history_idx);
-    const ChessBoard& board =
-        flip ? position.GetThemBoard() : position.GetBoard();
+    const Position& position = history[history_idx < 0 ? 0 : history_idx];
+    ChessBoard board = position.GetBoard();
+    if (flip) board.Mirror();
     // Castling changes can't be repeated, so we can stop early.
     if (stop_early && board.castlings().as_int() != castlings.as_int()) break;
     // Enpassants can't be repeated, but we do need to always send the current
     // position.
-    if (stop_early && history_idx != history.GetLength() - 1 &&
+    if (stop_early && history_idx != static_cast<int>(history.size()) - 1 &&
         !board.en_passant().empty()) {
       break;
     }
@@ -327,4 +327,316 @@ InputPlanes EncodePositionForNN(
   return result;
 }
 
+InputPlanes EncodePositionForNN(
+    pblczero::NetworkFormat::InputFormat input_format,
+    const PositionHistory& history, int history_planes,
+    FillEmptyHistory fill_empty_history, int* transform_out) {
+  return EncodePositionForNN(input_format, history.GetPositions(),
+                             history_planes, fill_empty_history, transform_out);
+}
+
+namespace {
+const char* kMoveStrs[] = {
+    "a1b1",  "a1c1",  "a1d1",  "a1e1",  "a1f1",  "a1g1",  "a1h1",  "a1a2",
+    "a1b2",  "a1c2",  "a1a3",  "a1b3",  "a1c3",  "a1a4",  "a1d4",  "a1a5",
+    "a1e5",  "a1a6",  "a1f6",  "a1a7",  "a1g7",  "a1a8",  "a1h8",  "b1a1",
+    "b1c1",  "b1d1",  "b1e1",  "b1f1",  "b1g1",  "b1h1",  "b1a2",  "b1b2",
+    "b1c2",  "b1d2",  "b1a3",  "b1b3",  "b1c3",  "b1d3",  "b1b4",  "b1e4",
+    "b1b5",  "b1f5",  "b1b6",  "b1g6",  "b1b7",  "b1h7",  "b1b8",  "c1a1",
+    "c1b1",  "c1d1",  "c1e1",  "c1f1",  "c1g1",  "c1h1",  "c1a2",  "c1b2",
+    "c1c2",  "c1d2",  "c1e2",  "c1a3",  "c1b3",  "c1c3",  "c1d3",  "c1e3",
+    "c1c4",  "c1f4",  "c1c5",  "c1g5",  "c1c6",  "c1h6",  "c1c7",  "c1c8",
+    "d1a1",  "d1b1",  "d1c1",  "d1e1",  "d1f1",  "d1g1",  "d1h1",  "d1b2",
+    "d1c2",  "d1d2",  "d1e2",  "d1f2",  "d1b3",  "d1c3",  "d1d3",  "d1e3",
+    "d1f3",  "d1a4",  "d1d4",  "d1g4",  "d1d5",  "d1h5",  "d1d6",  "d1d7",
+    "d1d8",  "e1a1",  "e1b1",  "e1c1",  "e1d1",  "e1f1",  "e1g1",  "e1h1",
+    "e1c2",  "e1d2",  "e1e2",  "e1f2",  "e1g2",  "e1c3",  "e1d3",  "e1e3",
+    "e1f3",  "e1g3",  "e1b4",  "e1e4",  "e1h4",  "e1a5",  "e1e5",  "e1e6",
+    "e1e7",  "e1e8",  "f1a1",  "f1b1",  "f1c1",  "f1d1",  "f1e1",  "f1g1",
+    "f1h1",  "f1d2",  "f1e2",  "f1f2",  "f1g2",  "f1h2",  "f1d3",  "f1e3",
+    "f1f3",  "f1g3",  "f1h3",  "f1c4",  "f1f4",  "f1b5",  "f1f5",  "f1a6",
+    "f1f6",  "f1f7",  "f1f8",  "g1a1",  "g1b1",  "g1c1",  "g1d1",  "g1e1",
+    "g1f1",  "g1h1",  "g1e2",  "g1f2",  "g1g2",  "g1h2",  "g1e3",  "g1f3",
+    "g1g3",  "g1h3",  "g1d4",  "g1g4",  "g1c5",  "g1g5",  "g1b6",  "g1g6",
+    "g1a7",  "g1g7",  "g1g8",  "h1a1",  "h1b1",  "h1c1",  "h1d1",  "h1e1",
+    "h1f1",  "h1g1",  "h1f2",  "h1g2",  "h1h2",  "h1f3",  "h1g3",  "h1h3",
+    "h1e4",  "h1h4",  "h1d5",  "h1h5",  "h1c6",  "h1h6",  "h1b7",  "h1h7",
+    "h1a8",  "h1h8",  "a2a1",  "a2b1",  "a2c1",  "a2b2",  "a2c2",  "a2d2",
+    "a2e2",  "a2f2",  "a2g2",  "a2h2",  "a2a3",  "a2b3",  "a2c3",  "a2a4",
+    "a2b4",  "a2c4",  "a2a5",  "a2d5",  "a2a6",  "a2e6",  "a2a7",  "a2f7",
+    "a2a8",  "a2g8",  "b2a1",  "b2b1",  "b2c1",  "b2d1",  "b2a2",  "b2c2",
+    "b2d2",  "b2e2",  "b2f2",  "b2g2",  "b2h2",  "b2a3",  "b2b3",  "b2c3",
+    "b2d3",  "b2a4",  "b2b4",  "b2c4",  "b2d4",  "b2b5",  "b2e5",  "b2b6",
+    "b2f6",  "b2b7",  "b2g7",  "b2b8",  "b2h8",  "c2a1",  "c2b1",  "c2c1",
+    "c2d1",  "c2e1",  "c2a2",  "c2b2",  "c2d2",  "c2e2",  "c2f2",  "c2g2",
+    "c2h2",  "c2a3",  "c2b3",  "c2c3",  "c2d3",  "c2e3",  "c2a4",  "c2b4",
+    "c2c4",  "c2d4",  "c2e4",  "c2c5",  "c2f5",  "c2c6",  "c2g6",  "c2c7",
+    "c2h7",  "c2c8",  "d2b1",  "d2c1",  "d2d1",  "d2e1",  "d2f1",  "d2a2",
+    "d2b2",  "d2c2",  "d2e2",  "d2f2",  "d2g2",  "d2h2",  "d2b3",  "d2c3",
+    "d2d3",  "d2e3",  "d2f3",  "d2b4",  "d2c4",  "d2d4",  "d2e4",  "d2f4",
+    "d2a5",  "d2d5",  "d2g5",  "d2d6",  "d2h6",  "d2d7",  "d2d8",  "e2c1",
+    "e2d1",  "e2e1",  "e2f1",  "e2g1",  "e2a2",  "e2b2",  "e2c2",  "e2d2",
+    "e2f2",  "e2g2",  "e2h2",  "e2c3",  "e2d3",  "e2e3",  "e2f3",  "e2g3",
+    "e2c4",  "e2d4",  "e2e4",  "e2f4",  "e2g4",  "e2b5",  "e2e5",  "e2h5",
+    "e2a6",  "e2e6",  "e2e7",  "e2e8",  "f2d1",  "f2e1",  "f2f1",  "f2g1",
+    "f2h1",  "f2a2",  "f2b2",  "f2c2",  "f2d2",  "f2e2",  "f2g2",  "f2h2",
+    "f2d3",  "f2e3",  "f2f3",  "f2g3",  "f2h3",  "f2d4",  "f2e4",  "f2f4",
+    "f2g4",  "f2h4",  "f2c5",  "f2f5",  "f2b6",  "f2f6",  "f2a7",  "f2f7",
+    "f2f8",  "g2e1",  "g2f1",  "g2g1",  "g2h1",  "g2a2",  "g2b2",  "g2c2",
+    "g2d2",  "g2e2",  "g2f2",  "g2h2",  "g2e3",  "g2f3",  "g2g3",  "g2h3",
+    "g2e4",  "g2f4",  "g2g4",  "g2h4",  "g2d5",  "g2g5",  "g2c6",  "g2g6",
+    "g2b7",  "g2g7",  "g2a8",  "g2g8",  "h2f1",  "h2g1",  "h2h1",  "h2a2",
+    "h2b2",  "h2c2",  "h2d2",  "h2e2",  "h2f2",  "h2g2",  "h2f3",  "h2g3",
+    "h2h3",  "h2f4",  "h2g4",  "h2h4",  "h2e5",  "h2h5",  "h2d6",  "h2h6",
+    "h2c7",  "h2h7",  "h2b8",  "h2h8",  "a3a1",  "a3b1",  "a3c1",  "a3a2",
+    "a3b2",  "a3c2",  "a3b3",  "a3c3",  "a3d3",  "a3e3",  "a3f3",  "a3g3",
+    "a3h3",  "a3a4",  "a3b4",  "a3c4",  "a3a5",  "a3b5",  "a3c5",  "a3a6",
+    "a3d6",  "a3a7",  "a3e7",  "a3a8",  "a3f8",  "b3a1",  "b3b1",  "b3c1",
+    "b3d1",  "b3a2",  "b3b2",  "b3c2",  "b3d2",  "b3a3",  "b3c3",  "b3d3",
+    "b3e3",  "b3f3",  "b3g3",  "b3h3",  "b3a4",  "b3b4",  "b3c4",  "b3d4",
+    "b3a5",  "b3b5",  "b3c5",  "b3d5",  "b3b6",  "b3e6",  "b3b7",  "b3f7",
+    "b3b8",  "b3g8",  "c3a1",  "c3b1",  "c3c1",  "c3d1",  "c3e1",  "c3a2",
+    "c3b2",  "c3c2",  "c3d2",  "c3e2",  "c3a3",  "c3b3",  "c3d3",  "c3e3",
+    "c3f3",  "c3g3",  "c3h3",  "c3a4",  "c3b4",  "c3c4",  "c3d4",  "c3e4",
+    "c3a5",  "c3b5",  "c3c5",  "c3d5",  "c3e5",  "c3c6",  "c3f6",  "c3c7",
+    "c3g7",  "c3c8",  "c3h8",  "d3b1",  "d3c1",  "d3d1",  "d3e1",  "d3f1",
+    "d3b2",  "d3c2",  "d3d2",  "d3e2",  "d3f2",  "d3a3",  "d3b3",  "d3c3",
+    "d3e3",  "d3f3",  "d3g3",  "d3h3",  "d3b4",  "d3c4",  "d3d4",  "d3e4",
+    "d3f4",  "d3b5",  "d3c5",  "d3d5",  "d3e5",  "d3f5",  "d3a6",  "d3d6",
+    "d3g6",  "d3d7",  "d3h7",  "d3d8",  "e3c1",  "e3d1",  "e3e1",  "e3f1",
+    "e3g1",  "e3c2",  "e3d2",  "e3e2",  "e3f2",  "e3g2",  "e3a3",  "e3b3",
+    "e3c3",  "e3d3",  "e3f3",  "e3g3",  "e3h3",  "e3c4",  "e3d4",  "e3e4",
+    "e3f4",  "e3g4",  "e3c5",  "e3d5",  "e3e5",  "e3f5",  "e3g5",  "e3b6",
+    "e3e6",  "e3h6",  "e3a7",  "e3e7",  "e3e8",  "f3d1",  "f3e1",  "f3f1",
+    "f3g1",  "f3h1",  "f3d2",  "f3e2",  "f3f2",  "f3g2",  "f3h2",  "f3a3",
+    "f3b3",  "f3c3",  "f3d3",  "f3e3",  "f3g3",  "f3h3",  "f3d4",  "f3e4",
+    "f3f4",  "f3g4",  "f3h4",  "f3d5",  "f3e5",  "f3f5",  "f3g5",  "f3h5",
+    "f3c6",  "f3f6",  "f3b7",  "f3f7",  "f3a8",  "f3f8",  "g3e1",  "g3f1",
+    "g3g1",  "g3h1",  "g3e2",  "g3f2",  "g3g2",  "g3h2",  "g3a3",  "g3b3",
+    "g3c3",  "g3d3",  "g3e3",  "g3f3",  "g3h3",  "g3e4",  "g3f4",  "g3g4",
+    "g3h4",  "g3e5",  "g3f5",  "g3g5",  "g3h5",  "g3d6",  "g3g6",  "g3c7",
+    "g3g7",  "g3b8",  "g3g8",  "h3f1",  "h3g1",  "h3h1",  "h3f2",  "h3g2",
+    "h3h2",  "h3a3",  "h3b3",  "h3c3",  "h3d3",  "h3e3",  "h3f3",  "h3g3",
+    "h3f4",  "h3g4",  "h3h4",  "h3f5",  "h3g5",  "h3h5",  "h3e6",  "h3h6",
+    "h3d7",  "h3h7",  "h3c8",  "h3h8",  "a4a1",  "a4d1",  "a4a2",  "a4b2",
+    "a4c2",  "a4a3",  "a4b3",  "a4c3",  "a4b4",  "a4c4",  "a4d4",  "a4e4",
+    "a4f4",  "a4g4",  "a4h4",  "a4a5",  "a4b5",  "a4c5",  "a4a6",  "a4b6",
+    "a4c6",  "a4a7",  "a4d7",  "a4a8",  "a4e8",  "b4b1",  "b4e1",  "b4a2",
+    "b4b2",  "b4c2",  "b4d2",  "b4a3",  "b4b3",  "b4c3",  "b4d3",  "b4a4",
+    "b4c4",  "b4d4",  "b4e4",  "b4f4",  "b4g4",  "b4h4",  "b4a5",  "b4b5",
+    "b4c5",  "b4d5",  "b4a6",  "b4b6",  "b4c6",  "b4d6",  "b4b7",  "b4e7",
+    "b4b8",  "b4f8",  "c4c1",  "c4f1",  "c4a2",  "c4b2",  "c4c2",  "c4d2",
+    "c4e2",  "c4a3",  "c4b3",  "c4c3",  "c4d3",  "c4e3",  "c4a4",  "c4b4",
+    "c4d4",  "c4e4",  "c4f4",  "c4g4",  "c4h4",  "c4a5",  "c4b5",  "c4c5",
+    "c4d5",  "c4e5",  "c4a6",  "c4b6",  "c4c6",  "c4d6",  "c4e6",  "c4c7",
+    "c4f7",  "c4c8",  "c4g8",  "d4a1",  "d4d1",  "d4g1",  "d4b2",  "d4c2",
+    "d4d2",  "d4e2",  "d4f2",  "d4b3",  "d4c3",  "d4d3",  "d4e3",  "d4f3",
+    "d4a4",  "d4b4",  "d4c4",  "d4e4",  "d4f4",  "d4g4",  "d4h4",  "d4b5",
+    "d4c5",  "d4d5",  "d4e5",  "d4f5",  "d4b6",  "d4c6",  "d4d6",  "d4e6",
+    "d4f6",  "d4a7",  "d4d7",  "d4g7",  "d4d8",  "d4h8",  "e4b1",  "e4e1",
+    "e4h1",  "e4c2",  "e4d2",  "e4e2",  "e4f2",  "e4g2",  "e4c3",  "e4d3",
+    "e4e3",  "e4f3",  "e4g3",  "e4a4",  "e4b4",  "e4c4",  "e4d4",  "e4f4",
+    "e4g4",  "e4h4",  "e4c5",  "e4d5",  "e4e5",  "e4f5",  "e4g5",  "e4c6",
+    "e4d6",  "e4e6",  "e4f6",  "e4g6",  "e4b7",  "e4e7",  "e4h7",  "e4a8",
+    "e4e8",  "f4c1",  "f4f1",  "f4d2",  "f4e2",  "f4f2",  "f4g2",  "f4h2",
+    "f4d3",  "f4e3",  "f4f3",  "f4g3",  "f4h3",  "f4a4",  "f4b4",  "f4c4",
+    "f4d4",  "f4e4",  "f4g4",  "f4h4",  "f4d5",  "f4e5",  "f4f5",  "f4g5",
+    "f4h5",  "f4d6",  "f4e6",  "f4f6",  "f4g6",  "f4h6",  "f4c7",  "f4f7",
+    "f4b8",  "f4f8",  "g4d1",  "g4g1",  "g4e2",  "g4f2",  "g4g2",  "g4h2",
+    "g4e3",  "g4f3",  "g4g3",  "g4h3",  "g4a4",  "g4b4",  "g4c4",  "g4d4",
+    "g4e4",  "g4f4",  "g4h4",  "g4e5",  "g4f5",  "g4g5",  "g4h5",  "g4e6",
+    "g4f6",  "g4g6",  "g4h6",  "g4d7",  "g4g7",  "g4c8",  "g4g8",  "h4e1",
+    "h4h1",  "h4f2",  "h4g2",  "h4h2",  "h4f3",  "h4g3",  "h4h3",  "h4a4",
+    "h4b4",  "h4c4",  "h4d4",  "h4e4",  "h4f4",  "h4g4",  "h4f5",  "h4g5",
+    "h4h5",  "h4f6",  "h4g6",  "h4h6",  "h4e7",  "h4h7",  "h4d8",  "h4h8",
+    "a5a1",  "a5e1",  "a5a2",  "a5d2",  "a5a3",  "a5b3",  "a5c3",  "a5a4",
+    "a5b4",  "a5c4",  "a5b5",  "a5c5",  "a5d5",  "a5e5",  "a5f5",  "a5g5",
+    "a5h5",  "a5a6",  "a5b6",  "a5c6",  "a5a7",  "a5b7",  "a5c7",  "a5a8",
+    "a5d8",  "b5b1",  "b5f1",  "b5b2",  "b5e2",  "b5a3",  "b5b3",  "b5c3",
+    "b5d3",  "b5a4",  "b5b4",  "b5c4",  "b5d4",  "b5a5",  "b5c5",  "b5d5",
+    "b5e5",  "b5f5",  "b5g5",  "b5h5",  "b5a6",  "b5b6",  "b5c6",  "b5d6",
+    "b5a7",  "b5b7",  "b5c7",  "b5d7",  "b5b8",  "b5e8",  "c5c1",  "c5g1",
+    "c5c2",  "c5f2",  "c5a3",  "c5b3",  "c5c3",  "c5d3",  "c5e3",  "c5a4",
+    "c5b4",  "c5c4",  "c5d4",  "c5e4",  "c5a5",  "c5b5",  "c5d5",  "c5e5",
+    "c5f5",  "c5g5",  "c5h5",  "c5a6",  "c5b6",  "c5c6",  "c5d6",  "c5e6",
+    "c5a7",  "c5b7",  "c5c7",  "c5d7",  "c5e7",  "c5c8",  "c5f8",  "d5d1",
+    "d5h1",  "d5a2",  "d5d2",  "d5g2",  "d5b3",  "d5c3",  "d5d3",  "d5e3",
+    "d5f3",  "d5b4",  "d5c4",  "d5d4",  "d5e4",  "d5f4",  "d5a5",  "d5b5",
+    "d5c5",  "d5e5",  "d5f5",  "d5g5",  "d5h5",  "d5b6",  "d5c6",  "d5d6",
+    "d5e6",  "d5f6",  "d5b7",  "d5c7",  "d5d7",  "d5e7",  "d5f7",  "d5a8",
+    "d5d8",  "d5g8",  "e5a1",  "e5e1",  "e5b2",  "e5e2",  "e5h2",  "e5c3",
+    "e5d3",  "e5e3",  "e5f3",  "e5g3",  "e5c4",  "e5d4",  "e5e4",  "e5f4",
+    "e5g4",  "e5a5",  "e5b5",  "e5c5",  "e5d5",  "e5f5",  "e5g5",  "e5h5",
+    "e5c6",  "e5d6",  "e5e6",  "e5f6",  "e5g6",  "e5c7",  "e5d7",  "e5e7",
+    "e5f7",  "e5g7",  "e5b8",  "e5e8",  "e5h8",  "f5b1",  "f5f1",  "f5c2",
+    "f5f2",  "f5d3",  "f5e3",  "f5f3",  "f5g3",  "f5h3",  "f5d4",  "f5e4",
+    "f5f4",  "f5g4",  "f5h4",  "f5a5",  "f5b5",  "f5c5",  "f5d5",  "f5e5",
+    "f5g5",  "f5h5",  "f5d6",  "f5e6",  "f5f6",  "f5g6",  "f5h6",  "f5d7",
+    "f5e7",  "f5f7",  "f5g7",  "f5h7",  "f5c8",  "f5f8",  "g5c1",  "g5g1",
+    "g5d2",  "g5g2",  "g5e3",  "g5f3",  "g5g3",  "g5h3",  "g5e4",  "g5f4",
+    "g5g4",  "g5h4",  "g5a5",  "g5b5",  "g5c5",  "g5d5",  "g5e5",  "g5f5",
+    "g5h5",  "g5e6",  "g5f6",  "g5g6",  "g5h6",  "g5e7",  "g5f7",  "g5g7",
+    "g5h7",  "g5d8",  "g5g8",  "h5d1",  "h5h1",  "h5e2",  "h5h2",  "h5f3",
+    "h5g3",  "h5h3",  "h5f4",  "h5g4",  "h5h4",  "h5a5",  "h5b5",  "h5c5",
+    "h5d5",  "h5e5",  "h5f5",  "h5g5",  "h5f6",  "h5g6",  "h5h6",  "h5f7",
+    "h5g7",  "h5h7",  "h5e8",  "h5h8",  "a6a1",  "a6f1",  "a6a2",  "a6e2",
+    "a6a3",  "a6d3",  "a6a4",  "a6b4",  "a6c4",  "a6a5",  "a6b5",  "a6c5",
+    "a6b6",  "a6c6",  "a6d6",  "a6e6",  "a6f6",  "a6g6",  "a6h6",  "a6a7",
+    "a6b7",  "a6c7",  "a6a8",  "a6b8",  "a6c8",  "b6b1",  "b6g1",  "b6b2",
+    "b6f2",  "b6b3",  "b6e3",  "b6a4",  "b6b4",  "b6c4",  "b6d4",  "b6a5",
+    "b6b5",  "b6c5",  "b6d5",  "b6a6",  "b6c6",  "b6d6",  "b6e6",  "b6f6",
+    "b6g6",  "b6h6",  "b6a7",  "b6b7",  "b6c7",  "b6d7",  "b6a8",  "b6b8",
+    "b6c8",  "b6d8",  "c6c1",  "c6h1",  "c6c2",  "c6g2",  "c6c3",  "c6f3",
+    "c6a4",  "c6b4",  "c6c4",  "c6d4",  "c6e4",  "c6a5",  "c6b5",  "c6c5",
+    "c6d5",  "c6e5",  "c6a6",  "c6b6",  "c6d6",  "c6e6",  "c6f6",  "c6g6",
+    "c6h6",  "c6a7",  "c6b7",  "c6c7",  "c6d7",  "c6e7",  "c6a8",  "c6b8",
+    "c6c8",  "c6d8",  "c6e8",  "d6d1",  "d6d2",  "d6h2",  "d6a3",  "d6d3",
+    "d6g3",  "d6b4",  "d6c4",  "d6d4",  "d6e4",  "d6f4",  "d6b5",  "d6c5",
+    "d6d5",  "d6e5",  "d6f5",  "d6a6",  "d6b6",  "d6c6",  "d6e6",  "d6f6",
+    "d6g6",  "d6h6",  "d6b7",  "d6c7",  "d6d7",  "d6e7",  "d6f7",  "d6b8",
+    "d6c8",  "d6d8",  "d6e8",  "d6f8",  "e6e1",  "e6a2",  "e6e2",  "e6b3",
+    "e6e3",  "e6h3",  "e6c4",  "e6d4",  "e6e4",  "e6f4",  "e6g4",  "e6c5",
+    "e6d5",  "e6e5",  "e6f5",  "e6g5",  "e6a6",  "e6b6",  "e6c6",  "e6d6",
+    "e6f6",  "e6g6",  "e6h6",  "e6c7",  "e6d7",  "e6e7",  "e6f7",  "e6g7",
+    "e6c8",  "e6d8",  "e6e8",  "e6f8",  "e6g8",  "f6a1",  "f6f1",  "f6b2",
+    "f6f2",  "f6c3",  "f6f3",  "f6d4",  "f6e4",  "f6f4",  "f6g4",  "f6h4",
+    "f6d5",  "f6e5",  "f6f5",  "f6g5",  "f6h5",  "f6a6",  "f6b6",  "f6c6",
+    "f6d6",  "f6e6",  "f6g6",  "f6h6",  "f6d7",  "f6e7",  "f6f7",  "f6g7",
+    "f6h7",  "f6d8",  "f6e8",  "f6f8",  "f6g8",  "f6h8",  "g6b1",  "g6g1",
+    "g6c2",  "g6g2",  "g6d3",  "g6g3",  "g6e4",  "g6f4",  "g6g4",  "g6h4",
+    "g6e5",  "g6f5",  "g6g5",  "g6h5",  "g6a6",  "g6b6",  "g6c6",  "g6d6",
+    "g6e6",  "g6f6",  "g6h6",  "g6e7",  "g6f7",  "g6g7",  "g6h7",  "g6e8",
+    "g6f8",  "g6g8",  "g6h8",  "h6c1",  "h6h1",  "h6d2",  "h6h2",  "h6e3",
+    "h6h3",  "h6f4",  "h6g4",  "h6h4",  "h6f5",  "h6g5",  "h6h5",  "h6a6",
+    "h6b6",  "h6c6",  "h6d6",  "h6e6",  "h6f6",  "h6g6",  "h6f7",  "h6g7",
+    "h6h7",  "h6f8",  "h6g8",  "h6h8",  "a7a1",  "a7g1",  "a7a2",  "a7f2",
+    "a7a3",  "a7e3",  "a7a4",  "a7d4",  "a7a5",  "a7b5",  "a7c5",  "a7a6",
+    "a7b6",  "a7c6",  "a7b7",  "a7c7",  "a7d7",  "a7e7",  "a7f7",  "a7g7",
+    "a7h7",  "a7a8",  "a7b8",  "a7c8",  "b7b1",  "b7h1",  "b7b2",  "b7g2",
+    "b7b3",  "b7f3",  "b7b4",  "b7e4",  "b7a5",  "b7b5",  "b7c5",  "b7d5",
+    "b7a6",  "b7b6",  "b7c6",  "b7d6",  "b7a7",  "b7c7",  "b7d7",  "b7e7",
+    "b7f7",  "b7g7",  "b7h7",  "b7a8",  "b7b8",  "b7c8",  "b7d8",  "c7c1",
+    "c7c2",  "c7h2",  "c7c3",  "c7g3",  "c7c4",  "c7f4",  "c7a5",  "c7b5",
+    "c7c5",  "c7d5",  "c7e5",  "c7a6",  "c7b6",  "c7c6",  "c7d6",  "c7e6",
+    "c7a7",  "c7b7",  "c7d7",  "c7e7",  "c7f7",  "c7g7",  "c7h7",  "c7a8",
+    "c7b8",  "c7c8",  "c7d8",  "c7e8",  "d7d1",  "d7d2",  "d7d3",  "d7h3",
+    "d7a4",  "d7d4",  "d7g4",  "d7b5",  "d7c5",  "d7d5",  "d7e5",  "d7f5",
+    "d7b6",  "d7c6",  "d7d6",  "d7e6",  "d7f6",  "d7a7",  "d7b7",  "d7c7",
+    "d7e7",  "d7f7",  "d7g7",  "d7h7",  "d7b8",  "d7c8",  "d7d8",  "d7e8",
+    "d7f8",  "e7e1",  "e7e2",  "e7a3",  "e7e3",  "e7b4",  "e7e4",  "e7h4",
+    "e7c5",  "e7d5",  "e7e5",  "e7f5",  "e7g5",  "e7c6",  "e7d6",  "e7e6",
+    "e7f6",  "e7g6",  "e7a7",  "e7b7",  "e7c7",  "e7d7",  "e7f7",  "e7g7",
+    "e7h7",  "e7c8",  "e7d8",  "e7e8",  "e7f8",  "e7g8",  "f7f1",  "f7a2",
+    "f7f2",  "f7b3",  "f7f3",  "f7c4",  "f7f4",  "f7d5",  "f7e5",  "f7f5",
+    "f7g5",  "f7h5",  "f7d6",  "f7e6",  "f7f6",  "f7g6",  "f7h6",  "f7a7",
+    "f7b7",  "f7c7",  "f7d7",  "f7e7",  "f7g7",  "f7h7",  "f7d8",  "f7e8",
+    "f7f8",  "f7g8",  "f7h8",  "g7a1",  "g7g1",  "g7b2",  "g7g2",  "g7c3",
+    "g7g3",  "g7d4",  "g7g4",  "g7e5",  "g7f5",  "g7g5",  "g7h5",  "g7e6",
+    "g7f6",  "g7g6",  "g7h6",  "g7a7",  "g7b7",  "g7c7",  "g7d7",  "g7e7",
+    "g7f7",  "g7h7",  "g7e8",  "g7f8",  "g7g8",  "g7h8",  "h7b1",  "h7h1",
+    "h7c2",  "h7h2",  "h7d3",  "h7h3",  "h7e4",  "h7h4",  "h7f5",  "h7g5",
+    "h7h5",  "h7f6",  "h7g6",  "h7h6",  "h7a7",  "h7b7",  "h7c7",  "h7d7",
+    "h7e7",  "h7f7",  "h7g7",  "h7f8",  "h7g8",  "h7h8",  "a8a1",  "a8h1",
+    "a8a2",  "a8g2",  "a8a3",  "a8f3",  "a8a4",  "a8e4",  "a8a5",  "a8d5",
+    "a8a6",  "a8b6",  "a8c6",  "a8a7",  "a8b7",  "a8c7",  "a8b8",  "a8c8",
+    "a8d8",  "a8e8",  "a8f8",  "a8g8",  "a8h8",  "b8b1",  "b8b2",  "b8h2",
+    "b8b3",  "b8g3",  "b8b4",  "b8f4",  "b8b5",  "b8e5",  "b8a6",  "b8b6",
+    "b8c6",  "b8d6",  "b8a7",  "b8b7",  "b8c7",  "b8d7",  "b8a8",  "b8c8",
+    "b8d8",  "b8e8",  "b8f8",  "b8g8",  "b8h8",  "c8c1",  "c8c2",  "c8c3",
+    "c8h3",  "c8c4",  "c8g4",  "c8c5",  "c8f5",  "c8a6",  "c8b6",  "c8c6",
+    "c8d6",  "c8e6",  "c8a7",  "c8b7",  "c8c7",  "c8d7",  "c8e7",  "c8a8",
+    "c8b8",  "c8d8",  "c8e8",  "c8f8",  "c8g8",  "c8h8",  "d8d1",  "d8d2",
+    "d8d3",  "d8d4",  "d8h4",  "d8a5",  "d8d5",  "d8g5",  "d8b6",  "d8c6",
+    "d8d6",  "d8e6",  "d8f6",  "d8b7",  "d8c7",  "d8d7",  "d8e7",  "d8f7",
+    "d8a8",  "d8b8",  "d8c8",  "d8e8",  "d8f8",  "d8g8",  "d8h8",  "e8e1",
+    "e8e2",  "e8e3",  "e8a4",  "e8e4",  "e8b5",  "e8e5",  "e8h5",  "e8c6",
+    "e8d6",  "e8e6",  "e8f6",  "e8g6",  "e8c7",  "e8d7",  "e8e7",  "e8f7",
+    "e8g7",  "e8a8",  "e8b8",  "e8c8",  "e8d8",  "e8f8",  "e8g8",  "e8h8",
+    "f8f1",  "f8f2",  "f8a3",  "f8f3",  "f8b4",  "f8f4",  "f8c5",  "f8f5",
+    "f8d6",  "f8e6",  "f8f6",  "f8g6",  "f8h6",  "f8d7",  "f8e7",  "f8f7",
+    "f8g7",  "f8h7",  "f8a8",  "f8b8",  "f8c8",  "f8d8",  "f8e8",  "f8g8",
+    "f8h8",  "g8g1",  "g8a2",  "g8g2",  "g8b3",  "g8g3",  "g8c4",  "g8g4",
+    "g8d5",  "g8g5",  "g8e6",  "g8f6",  "g8g6",  "g8h6",  "g8e7",  "g8f7",
+    "g8g7",  "g8h7",  "g8a8",  "g8b8",  "g8c8",  "g8d8",  "g8e8",  "g8f8",
+    "g8h8",  "h8a1",  "h8h1",  "h8b2",  "h8h2",  "h8c3",  "h8h3",  "h8d4",
+    "h8h4",  "h8e5",  "h8h5",  "h8f6",  "h8g6",  "h8h6",  "h8f7",  "h8g7",
+    "h8h7",  "h8a8",  "h8b8",  "h8c8",  "h8d8",  "h8e8",  "h8f8",  "h8g8",
+    "a7a8q", "a7a8r", "a7a8b", "a7b8q", "a7b8r", "a7b8b", "b7a8q", "b7a8r",
+    "b7a8b", "b7b8q", "b7b8r", "b7b8b", "b7c8q", "b7c8r", "b7c8b", "c7b8q",
+    "c7b8r", "c7b8b", "c7c8q", "c7c8r", "c7c8b", "c7d8q", "c7d8r", "c7d8b",
+    "d7c8q", "d7c8r", "d7c8b", "d7d8q", "d7d8r", "d7d8b", "d7e8q", "d7e8r",
+    "d7e8b", "e7d8q", "e7d8r", "e7d8b", "e7e8q", "e7e8r", "e7e8b", "e7f8q",
+    "e7f8r", "e7f8b", "f7e8q", "f7e8r", "f7e8b", "f7f8q", "f7f8r", "f7f8b",
+    "f7g8q", "f7g8r", "f7g8b", "g7f8q", "g7f8r", "g7f8b", "g7g8q", "g7g8r",
+    "g7g8b", "g7h8q", "g7h8r", "g7h8b", "h7g8q", "h7g8r", "h7g8b", "h7h8q",
+    "h7h8r", "h7h8b"};
+
+const std::array kPackedIdxToNNIdx = []() {
+  std::array<uint16_t, 64 * 64 * 4> indices{};
+  size_t idx = 0;
+  for (const char* move_str : kMoveStrs) {
+    std::string move(move_str);
+    uint16_t from = Square::Parse(move.substr(0, 2)).as_idx();
+    uint16_t to = Square::Parse(move.substr(2, 2)).as_idx();
+    uint16_t promotion = move.size() == 5 ? PieceType::Parse(move[4]).idx : 0;
+    uint16_t packed_idx = promotion * 64 * 64 + from * 64 + to;
+    indices[packed_idx] = idx++;
+  }
+  return indices;
+}();
+
+uint16_t MoveAsPackedInt(Move move) {
+  enum Masks : uint16_t {
+    // clang-format off
+    kFromToMask  = 0b0000111111111111,
+    kPromotion   = 0b0100000000000000,
+    kPieceMask   = 0b0011000000000000,
+    // clang-format on
+  };
+  uint16_t val = move.raw_data();
+  return (val & kPromotion) ? (val & (kFromToMask | kPieceMask))
+                            : (val & kFromToMask);
+}
+
+Square Transform(Square sq, int transform) {
+  File file = sq.file();
+  Rank rank = sq.rank();
+  if ((transform & FlipTransform) != 0) file.Flop();
+  if ((transform & MirrorTransform) != 0) rank.Flip();
+  if ((transform & TransposeTransform) == 0) return Square(file, rank);
+  return Square(File::FromIdx(kRank8 - rank), Rank::FromIdx(kFileH - file));
+}
+}  // namespace
+
+uint16_t MoveToNNIndex(Move move, int transform) {
+  if (transform == 0) return kPackedIdxToNNIdx[MoveAsPackedInt(move)];
+  const Square from = Transform(move.from(), transform);
+  const Square to = Transform(move.to(), transform);
+  const Move transformed =
+      move.is_promotion() ? Move::WhitePromotion(from, to, move.promotion())
+                          : Move::White(from, to);
+  return kPackedIdxToNNIdx[MoveAsPackedInt(transformed)];
+}
+
+Move MoveFromNNIndex(int idx, int transform) {
+  std::string m_str = kMoveStrs[idx];
+  auto from = Square::Parse(m_str.substr(0, 2));
+  auto to = Square::Parse(m_str.substr(2, 2));
+  if (transform != 0) {
+    int inv_transform;
+    if (transform & TransposeTransform) {
+      inv_transform = TransposeTransform;
+      if (transform & FlipTransform) inv_transform |= MirrorTransform;
+      if (transform & MirrorTransform) inv_transform |= FlipTransform;
+    } else {
+      inv_transform = transform;
+    }
+    to = Transform(to, inv_transform);
+    from = Transform(from, inv_transform);
+  }
+  return m_str.size() == 5
+             ? Move::WhitePromotion(from, to, PieceType::Parse(m_str[4]))
+             : Move::White(from, to);
+}
+
 }  // namespace lczero
diff --git a/src/neural/encoder.h b/src/neural/encoder.h
index e027a0743f..5782f48dd8 100644
--- a/src/neural/encoder.h
+++ b/src/neural/encoder.h
@@ -27,6 +27,8 @@
 
 #pragma once
 
+#include <span>
+
 #include "chess/position.h"
 #include "neural/network.h"
 #include "proto/net.pb.h"
@@ -49,10 +51,18 @@ InputPlanes EncodePositionForNN(
     const PositionHistory& history, int history_planes,
     FillEmptyHistory fill_empty_history, int* transform_out);
 
+InputPlanes EncodePositionForNN(
+    pblczero::NetworkFormat::InputFormat input_format,
+    std::span<const Position> positions, int history_planes,
+    FillEmptyHistory fill_empty_history, int* transform_out);
+
 bool IsCanonicalFormat(pblczero::NetworkFormat::InputFormat input_format);
 bool IsCanonicalArmageddonFormat(
     pblczero::NetworkFormat::InputFormat input_format);
 bool IsHectopliesFormat(pblczero::NetworkFormat::InputFormat input_format);
 bool Is960CastlingFormat(pblczero::NetworkFormat::InputFormat input_format);
 
+uint16_t MoveToNNIndex(Move move, int transform);
+Move MoveFromNNIndex(int idx, int transform);
+
 }  // namespace lczero
diff --git a/src/neural/encoder_test.cc b/src/neural/encoder_test.cc
index d41074479f..6d89bb14af 100644
--- a/src/neural/encoder_test.cc
+++ b/src/neural/encoder_test.cc
@@ -290,7 +290,7 @@ TEST(EncodePositionForNN, EncodeFiftyMoveCounter) {
   history.Reset(board, 0, 1);
 
   // 1. Nf3
-  history.Append(Move("g1f3", false));
+  history.Append(history.Last().GetBoard().ParseMove("g1f3"));
 
   InputPlanes encoded_planes =
       EncodePositionForNN(pblczero::NetworkFormat::INPUT_CLASSICAL_112_PLANE,
@@ -305,7 +305,7 @@ TEST(EncodePositionForNN, EncodeFiftyMoveCounter) {
   EXPECT_EQ(fifty_move_counter_plane.value, 1.0f);
 
   // 1. Nf3 Nf6
-  history.Append(Move("g8f6", true));
+  history.Append(history.Last().GetBoard().ParseMove("g8f6"));
 
   encoded_planes =
       EncodePositionForNN(pblczero::NetworkFormat::INPUT_CLASSICAL_112_PLANE,
@@ -326,7 +326,7 @@ TEST(EncodePositionForNN, EncodeFiftyMoveCounterFormat3) {
   history.Reset(board, 0, 1);
 
   // 1. Nf3
-  history.Append(Move("g1f3", false));
+  history.Append(history.Last().GetBoard().ParseMove("g1f3"));
 
   InputPlanes encoded_planes = EncodePositionForNN(
       pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION, history, 8,
@@ -340,7 +340,7 @@ TEST(EncodePositionForNN, EncodeFiftyMoveCounterFormat3) {
   EXPECT_EQ(fifty_move_counter_plane.value, 1.0f);
 
   // 1. Nf3 Nf6
-  history.Append(Move("g8f6", true));
+  history.Append(history.Last().GetBoard().ParseMove("g8f6"));
 
   encoded_planes = EncodePositionForNN(
       pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION, history, 8,
@@ -426,10 +426,10 @@ TEST(EncodePositionForNN, EncodeEnpassantFormat3) {
   board.SetFromFen(ChessBoard::kStartposFen);
   history.Reset(board, 0, 1);
   // Move to en passant.
-  history.Append(Move("e2e4", false));
-  history.Append(Move("g2g3", false));
-  history.Append(Move("e4e5", false));
-  history.Append(Move("f2f4", false));
+  history.Append(history.Last().GetBoard().ParseMove("e2e4"));
+  history.Append(history.Last().GetBoard().ParseMove("g7g6"));
+  history.Append(history.Last().GetBoard().ParseMove("e4e5"));
+  history.Append(history.Last().GetBoard().ParseMove("f7f5"));
 
   InputPlanes encoded_planes = EncodePositionForNN(
       pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION, history, 8,
@@ -447,7 +447,7 @@ TEST(EncodePositionForNN, EncodeEnpassantFormat3) {
   }
 
   // Boring move.
-  history.Append(Move("g1f3", false));
+  history.Append(history.Last().GetBoard().ParseMove("g1f3"));
 
   encoded_planes = EncodePositionForNN(
       pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION, history, 8,
@@ -466,7 +466,7 @@ TEST(EncodePositionForNN, EncodeEnpassantFormat3) {
   }
 
   // Another boring move.
-  history.Append(Move("g1f3", false));
+  history.Append(history.Last().GetBoard().ParseMove("g8f6"));
 
   encoded_planes = EncodePositionForNN(
       pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION, history, 8,
@@ -493,11 +493,11 @@ TEST(EncodePositionForNN, EncodeEarlyGameFlipFormat3) {
   board.SetFromFen(ChessBoard::kStartposFen);
   history.Reset(board, 0, 1);
   // Move to break castling and king offside.
-  history.Append(Move("e2e4", false));
-  history.Append(Move("e2e4", false));
-  history.Append(Move("e1e2", false));
-  history.Append(Move("e1e2", false));
-  history.Append(Move("e2d3", false));
+  history.Append(history.Last().GetBoard().ParseMove("e2e4"));
+  history.Append(history.Last().GetBoard().ParseMove("e7e5"));
+  history.Append(history.Last().GetBoard().ParseMove("e1e2"));
+  history.Append(history.Last().GetBoard().ParseMove("e8e7"));
+  history.Append(history.Last().GetBoard().ParseMove("e2d3"));
   // Their king offside, but not ours.
 
   int transform;
@@ -514,7 +514,7 @@ TEST(EncodePositionForNN, EncodeEarlyGameFlipFormat3) {
   EXPECT_EQ(their_king_plane.mask, 1ull << 43);
   EXPECT_EQ(their_king_plane.value, 1.0f);
 
-  history.Append(Move("e2e3", false));
+  history.Append(history.Last().GetBoard().ParseMove("e7e6"));
 
   // Our king offside, but theirs is not.
   encoded_planes = EncodePositionForNN(
diff --git a/src/neural/factory.cc b/src/neural/factory.cc
index 03c69ef682..778fb47c90 100644
--- a/src/neural/factory.cc
+++ b/src/neural/factory.cc
@@ -29,28 +29,14 @@
 
 #include <algorithm>
 
+#include "default_backend.h"
 #include "neural/loader.h"
+#include "neural/shared_params.h"
 #include "utils/commandline.h"
 #include "utils/logging.h"
 
 namespace lczero {
 
-const OptionId NetworkFactory::kWeightsId{
-    "weights", "WeightsFile",
-    "Path from which to load network weights.\nSetting it to <autodiscover> "
-    "makes it search in ./ and ./weights/ subdirectories for the latest (by "
-    "file date) file which looks like weights.",
-    'w'};
-const OptionId NetworkFactory::kBackendId{
-    "backend", "Backend", "Neural network computational backend to use.", 'b'};
-const OptionId NetworkFactory::kBackendOptionsId{
-    "backend-opts", "BackendOptions",
-    "Parameters of neural network backend. "
-    "Exact parameters differ per backend.",
-    'o'};
-const char* kAutoDiscover = "<autodiscover>";
-const char* kEmbed = "<built in>";
-
 NetworkFactory* NetworkFactory::Get() {
   static NetworkFactory factory;
   return &factory;
@@ -61,18 +47,6 @@ NetworkFactory::Register::Register(const std::string& name, FactoryFunc factory,
   NetworkFactory::Get()->RegisterNetwork(name, factory, priority);
 }
 
-void NetworkFactory::PopulateOptions(OptionsParser* options) {
-#if defined(EMBED)
-  options->Add<StringOption>(NetworkFactory::kWeightsId) = kEmbed;
-#else
-  options->Add<StringOption>(NetworkFactory::kWeightsId) = kAutoDiscover;
-#endif
-  const auto backends = NetworkFactory::Get()->GetBackendsList();
-  options->Add<ChoiceOption>(NetworkFactory::kBackendId, backends) =
-      backends.empty() ? "<none>" : backends[0];
-  options->Add<StringOption>(NetworkFactory::kBackendOptionsId);
-}
-
 void NetworkFactory::RegisterNetwork(const std::string& name,
                                      FactoryFunc factory, int priority) {
   factories_.emplace_back(name, factory, priority);
@@ -81,7 +55,15 @@ void NetworkFactory::RegisterNetwork(const std::string& name,
 
 std::vector<std::string> NetworkFactory::GetBackendsList() const {
   std::vector<std::string> result;
-  for (const auto& x : factories_) result.emplace_back(x.name);
+#ifdef DEFAULT_BACKEND
+  result.emplace_back(DEFAULT_BACKEND);
+#endif
+  for (const auto& x : factories_) {
+#ifdef DEFAULT_BACKEND
+    if (x.name == result[0]) continue;
+#endif
+    result.emplace_back(x.name);
+  }
   return result;
 }
 
@@ -99,9 +81,10 @@ std::unique_ptr<Network> NetworkFactory::Create(
 
 NetworkFactory::BackendConfiguration::BackendConfiguration(
     const OptionsDict& options)
-    : weights_path(options.Get<std::string>(kWeightsId)),
-      backend(options.Get<std::string>(kBackendId)),
-      backend_options(options.Get<std::string>(kBackendOptionsId)) {}
+    : weights_path(options.Get<std::string>(SharedBackendParams::kWeightsId)),
+      backend(options.Get<std::string>(SharedBackendParams::kBackendId)),
+      backend_options(
+          options.Get<std::string>(SharedBackendParams::kBackendOptionsId)) {}
 
 bool NetworkFactory::BackendConfiguration::operator==(
     const BackendConfiguration& other) const {
@@ -111,27 +94,20 @@ bool NetworkFactory::BackendConfiguration::operator==(
 
 std::unique_ptr<Network> NetworkFactory::LoadNetwork(
     const OptionsDict& options) {
-  std::string net_path = options.Get<std::string>(kWeightsId);
-  const std::string backend = options.Get<std::string>(kBackendId);
+  std::string net_path =
+      options.Get<std::string>(SharedBackendParams::kWeightsId);
+  const std::string backend =
+      options.Get<std::string>(SharedBackendParams::kBackendId);
   const std::string backend_options =
-      options.Get<std::string>(kBackendOptionsId);
-
-  if (net_path == kAutoDiscover) {
-    net_path = DiscoverWeightsFile();
-  } else if (net_path == kEmbed) {
-    net_path = CommandLine::BinaryName();
-  } else {
-    CERR << "Loading weights file from: " << net_path;
-  }
-  std::optional<WeightsFile> weights;
-  if (!net_path.empty()) {
-    weights = LoadWeightsFromFile(net_path);
-  }
+      options.Get<std::string>(SharedBackendParams::kBackendOptionsId);
 
+  std::optional<WeightsFile> weights;
+  if (!net_path.empty()) weights = LoadWeights(net_path);
   OptionsDict network_options(&options);
   network_options.AddSubdictFromString(backend_options);
 
-  auto ptr = NetworkFactory::Get()->Create(backend, weights, network_options);
+  auto ptr = NetworkFactory::Get()->Create(backend, std::move(weights),
+                                           network_options);
   network_options.CheckAllOptionsRead(backend);
   return ptr;
 }
diff --git a/src/neural/factory.h b/src/neural/factory.h
index 4448aa3224..fb52c29bb1 100644
--- a/src/neural/factory.h
+++ b/src/neural/factory.h
@@ -33,6 +33,7 @@
 
 #include "neural/loader.h"
 #include "neural/network.h"
+#include "neural/wrapper.h"
 #include "utils/optionsdict.h"
 #include "utils/optionsparser.h"
 
@@ -55,9 +56,6 @@ class NetworkFactory {
     Register(const std::string& name, FactoryFunc factory, int priority = 0);
   };
 
-  // Add the network/backend parameters to the options dictionary.
-  static void PopulateOptions(OptionsParser* options);
-
   // Returns list of backend names, sorted by priority (higher priority first).
   std::vector<std::string> GetBackendsList() const;
 
@@ -70,11 +68,6 @@ class NetworkFactory {
   // if no network options changed since the previous call.
   static std::unique_ptr<Network> LoadNetwork(const OptionsDict& options);
 
-  // Parameter IDs.
-  static const OptionId kWeightsId;
-  static const OptionId kBackendId;
-  static const OptionId kBackendOptionsId;
-
   struct BackendConfiguration {
     BackendConfiguration() = default;
     BackendConfiguration(const OptionsDict& options);
@@ -115,15 +108,25 @@ class NetworkFactory {
   friend class Register;
 };
 
-#define REGISTER_NETWORK_WITH_COUNTER2(name, func, priority, counter) \
-  namespace {                                                         \
-  static NetworkFactory::Register regH38fhs##counter(                 \
-      name,                                                           \
-      [](const std::optional<WeightsFile>& w, const OptionsDict& o) { \
-        return func(w, o);                                            \
-      },                                                              \
-      priority);                                                      \
+#define REGISTER_NETWORK_WITH_COUNTER2(name, func, priority, counter)       \
+  namespace {                                                               \
+  namespace ns##counter {                                                   \
+    [[maybe_unused]] static NetworkFactory::Register regH38fhs##counter(    \
+        name,                                                               \
+        [](const std::optional<WeightsFile>& w, const OptionsDict& o) {     \
+          return func(w, o);                                                \
+        },                                                                  \
+        priority);                                                          \
+    [[maybe_unused]] static BackendManager::Register regK03nv##counter(     \
+        std::make_unique<NetworkAsBackendFactory>(                          \
+            name,                                                           \
+            [](const std::optional<WeightsFile>& w, const OptionsDict& o) { \
+              return func(w, o);                                            \
+            },                                                              \
+            priority));                                                     \
+  }                                                                         \
   }
+
 #define REGISTER_NETWORK_WITH_COUNTER(name, func, priority, counter) \
   REGISTER_NETWORK_WITH_COUNTER2(name, func, priority, counter)
 
diff --git a/src/neural/loader.cc b/src/neural/loader.cc
index c88d985c2b..9b706cabed 100644
--- a/src/neural/loader.cc
+++ b/src/neural/loader.cc
@@ -37,11 +37,12 @@
 #include <sstream>
 #include <string>
 
+#include "neural/shared_params.h"
 #include "proto/net.pb.h"
 #include "utils/commandline.h"
 #include "utils/exception.h"
 #include "utils/filesystem.h"
-#include "utils/logging.h"
+#include "utils/optionsdict.h"
 #include "version.h"
 
 #ifdef _WIN32
@@ -127,13 +128,11 @@ void FixOlderWeightsFile(WeightsFile* file) {
     net->set_network(nf::NETWORK_SE_WITH_HEADFORMAT);
     net->set_value(nf::VALUE_CLASSICAL);
     net->set_policy(nf::POLICY_CLASSICAL);
-  } else if (network_format ==
-                 nf::NETWORK_SE_WITH_HEADFORMAT &&
+  } else if (network_format == nf::NETWORK_SE_WITH_HEADFORMAT &&
              file->weights().encoder().size() > 0) {
     // Attention body network made with old protobuf.
     auto* net = file->mutable_format()->mutable_network_format();
-    net->set_network(
-        nf::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT);
+    net->set_network(nf::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT);
     if (file->weights().has_smolgen_w()) {
       // Need to override activation defaults for smolgen.
       net->set_ffn_activation(nf::ACTIVATION_RELU_2);
@@ -183,7 +182,8 @@ WeightsFile ParseWeightsProto(const std::string& buffer) {
   }
 
   if (net.has_weights() &&
-      net.format().weights_encoding() != pblczero::Format::LINEAR16) {
+      net.format().weights_encoding() != pblczero::Format::LINEAR16 &&
+      net_ver < GetVersionInt(0, 33, 0)) {
     throw Exception("Invalid weight file: unsupported encoding.");
   }
 
@@ -211,6 +211,22 @@ WeightsFile LoadWeightsFromFile(const std::string& filename) {
   return ParseWeightsProto(buffer);
 }
 
+std::optional<WeightsFile> LoadWeights(std::string_view location) {
+  const bool embedded = location == SharedBackendParams::kEmbed;
+  // The two magic locations resolve to a path; anything else already is one.
+  std::string path(location);
+  if (embedded) {
+    path = CommandLine::BinaryName();
+  } else if (location == SharedBackendParams::kAutoDiscover) {
+    path = DiscoverWeightsFile();
+  }
+  if (path.empty()) return std::nullopt;  // Nothing to load.
+  const char* const banner = embedded ? "Using embedded weights from binary: "
+                                      : "Loading weights file from: ";
+  CERR << banner << path;
+  return LoadWeightsFromFile(path);
+}
+
 std::string DiscoverWeightsFile() {
   const int kMinFileSize = 500000;  // 500 KB
 
@@ -249,24 +265,12 @@ std::string DiscoverWeightsFile() {
       gzclose(file);
       if (sz < 0) continue;
 
-      std::string str(buf, buf + sz);
-      std::istringstream data(str);
-      int val = 0;
-      data >> val;
-      if (!data.fail() && val == 2) {
-        CERR << "Found txt network file: " << candidate.second;
-        return candidate.second;
-      }
-
       // First byte of the protobuf stream is 0x0d for fixed32, so we ignore it
       // as our own magic should suffice.
       const auto magic = buf[1] | (static_cast<uint32_t>(buf[2]) << 8) |
                          (static_cast<uint32_t>(buf[3]) << 16) |
                          (static_cast<uint32_t>(buf[4]) << 24);
-      if (magic == kWeightMagic) {
-        CERR << "Found pb network file: " << candidate.second;
-        return candidate.second;
-      }
+      if (magic == kWeightMagic) return candidate.second;
     }
   }
   LOGFILE << "Network weights file not found.";
diff --git a/src/neural/loader.h b/src/neural/loader.h
index 279e87e303..55ae8d2aa0 100644
--- a/src/neural/loader.h
+++ b/src/neural/loader.h
@@ -27,7 +27,9 @@
 
 #pragma once
 
+#include <optional>
 #include <string>
+#include <string_view>
 #include <vector>
 
 #include "neural/network.h"
@@ -35,6 +37,7 @@
 
 namespace lczero {
 
+class OptionsDict;
 using FloatVector = std::vector<float>;
 using FloatVectors = std::vector<FloatVector>;
 
@@ -43,6 +46,13 @@ using WeightsFile = pblczero::Net;
 // Read weights file and fill the weights structure.
 WeightsFile LoadWeightsFromFile(const std::string& filename);
 
+// Reads weights from `location`, which is one of:
+// * "<autodiscover>" -- tries to find a file which looks like a weights file.
+// * "<embed>" -- weights are embedded in the binary.
+// * filename -- reads weights from the file.
+// Returns std::nullopt if `location` is empty or autodiscovery finds no file.
+std::optional<WeightsFile> LoadWeights(std::string_view location);
+
 // Tries to find a file which looks like a weights file, and located in
 // directory of binary_name or one of subdirectories. If there are several such
 // files, returns one which has the latest modification date.
diff --git a/src/neural/memcache.cc b/src/neural/memcache.cc
new file mode 100644
index 0000000000..c64dd1696f
--- /dev/null
+++ b/src/neural/memcache.cc
@@ -0,0 +1,193 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/memcache.h"
+
+#include "neural/shared_params.h"
+#include "utils/atomic_vector.h"
+#include "utils/cache.h"
+#include "utils/smallarray.h"
+
+namespace lczero {
+namespace {
+
+// TODO For now it uses the hash of the current position, ignoring repetitions
+// and history. We'll likely need to have configurable hash function that we'll
+// also reuse as a tree hash key.
+uint64_t ComputeEvalPositionHash(const EvalPosition& pos) {
+  return pos.pos.back().Hash();  // Only the last element of pos.pos is hashed.
+}
+
+struct CachedValue {
+  float q;
+  float d;
+  float m;
+  uint8_t num_moves;  // legal_moves.size() at insertion, narrowed to 8 bits.
+  std::unique_ptr<float[]> p;  // Null when stored without legal moves.
+};
+
+void CachedValueToEvalResult(const CachedValue& cv, const EvalResultPtr& ptr) {
+  if (ptr.q) *ptr.q = cv.q;  // Each scalar output is written only if non-null.
+  if (ptr.d) *ptr.d = cv.d;
+  if (ptr.m) *ptr.m = cv.m;
+  std::copy_n(cv.p.get(), ptr.p.size(), ptr.p.begin());
+}
+
+class MemCache : public CachingBackend {
+ public:
+  MemCache(std::unique_ptr<Backend> wrapped, const OptionsDict& options)
+      : wrapped_backend_(std::move(wrapped)),
+        cache_(options.Get<int>(SharedBackendParams::kNNCacheSizeId)),
+        max_batch_size_(wrapped_backend_->GetAttributes().maximum_batch_size) {}
+
+  // Attributes and configuration identity come from the wrapped backend.
+  BackendAttributes GetAttributes() const override {
+    return wrapped_backend_->GetAttributes();
+  }
+  bool IsSameConfiguration(const OptionsDict& options) const override {
+    return wrapped_backend_->IsSameConfiguration(options);
+  }
+
+  std::unique_ptr<BackendComputation> CreateComputation() override;
+  std::optional<EvalResult> GetCachedEvaluation(const EvalPosition&) override;
+
+  // Forwards the update. On UPDATE_OK the cache is cleared unless the wrapped
+  // backend's IsSameConfiguration(options) holds.
+  // NOTE(review): that check runs after the update -- confirm it is intended.
+  UpdateConfigurationResult UpdateConfiguration(
+      const OptionsDict& options) override {
+    const auto status = wrapped_backend_->UpdateConfiguration(options);
+    const bool stale = status == Backend::UPDATE_OK &&
+                       !wrapped_backend_->IsSameConfiguration(options);
+    if (stale) cache_.Clear();
+    return status;
+  }
+
+  void ClearCache() override { cache_.Clear(); }
+  void SetCacheSize(size_t size) override { cache_.SetCapacity(size); }
+
+ private:
+  friend class MemCacheComputation;  // Reads cache_ and max_batch_size_.
+  std::unique_ptr<Backend> wrapped_backend_;
+  HashKeyedCache<CachedValue> cache_;
+  const size_t max_batch_size_;
+};
+
+// Answers cache hits at once and forwards misses to the wrapped computation.
+class MemCacheComputation : public BackendComputation {
+ public:
+  MemCacheComputation(std::unique_ptr<BackendComputation> wrapped_computation,
+                      MemCache* memcache)
+      : wrapped_computation_(std::move(wrapped_computation)),
+        memcache_(memcache),
+        entries_(memcache->max_batch_size_) {}
+
+ private:
+  struct Entry {
+    uint64_t key;
+    std::unique_ptr<CachedValue> value;
+    EvalResultPtr result_ptr;
+  };
+
+  size_t UsedBatchSize() const override {
+    return wrapped_computation_->UsedBatchSize();
+  }
+  AddInputResult AddInput(const EvalPosition& pos,
+                          EvalResultPtr result) override {
+    const size_t num_moves = pos.legal_moves.size();
+    assert(num_moves == result.p.size() || result.p.empty());
+    const uint64_t hash = ComputeEvalPositionHash(pos);
+    {
+      HashKeyedCacheLock<CachedValue> lock(&memcache_->cache_, hash);
+      // Search sometimes queries without legal moves; such results are cached
+      // too, but are only returned to queries that again pass no legal moves.
+      // Otherwise the policy length must match, guarding against collisions.
+      if (lock.holds_value() &&
+          (num_moves == 0 || (lock->p && lock->num_moves == num_moves))) {
+        CachedValueToEvalResult(**lock, result);
+        return AddInputResult::FETCHED_IMMEDIATELY;
+      }
+    }
+    // Miss: evaluate into a fresh entry; ComputeBlocking() copies and caches it.
+    const size_t slot = entries_.emplace_back(
+        Entry{hash, std::make_unique<CachedValue>(), result});
+    auto& fresh = entries_[slot].value;
+    fresh->p.reset(num_moves == 0 ? nullptr : new float[num_moves]);
+    fresh->num_moves = num_moves;
+    const std::span<float> policy =
+        fresh->p ? std::span<float>{fresh->p.get(), num_moves}
+                 : std::span<float>{};
+    return wrapped_computation_->AddInput(
+        pos, EvalResultPtr{&fresh->q, &fresh->d, &fresh->m, policy});
+  }
+
+  void ComputeBlocking() override {
+    wrapped_computation_->ComputeBlocking();
+    for (auto& done : entries_) {
+      CachedValueToEvalResult(*done.value, done.result_ptr);
+      memcache_->cache_.Insert(done.key, std::move(done.value));
+    }
+  }
+
+  std::unique_ptr<BackendComputation> wrapped_computation_;
+  MemCache* memcache_;
+  AtomicVector<Entry> entries_;
+};
+
+std::unique_ptr<BackendComputation> MemCache::CreateComputation() {
+  auto inner = wrapped_backend_->CreateComputation();  // Handles the misses.
+  return std::make_unique<MemCacheComputation>(std::move(inner), this);
+}
+std::optional<EvalResult> MemCache::GetCachedEvaluation(
+    const EvalPosition& pos) {
+  const size_t num_moves = pos.legal_moves.size();
+  HashKeyedCacheLock<CachedValue> lock(&cache_, ComputeEvalPositionHash(pos));
+  if (!lock.holds_value()) return std::nullopt;
+  // When legal moves are given, the entry must hold a policy of that length.
+  const bool policy_ok = lock->p && lock->num_moves == num_moves;
+  if (num_moves != 0 && !policy_ok) return std::nullopt;
+
+  EvalResult result;
+  result.q = lock->q;
+  result.d = lock->d;
+  result.m = lock->m;
+  if (lock->p) {
+    result.p.reserve(num_moves);
+    std::copy(lock->p.get(), lock->p.get() + num_moves,
+              std::back_inserter(result.p));
+  }
+  return result;
+}
+
+}  // namespace
+
+std::unique_ptr<CachingBackend> CreateMemCache(std::unique_ptr<Backend> wrapped,
+                                               const OptionsDict& options) {
+  return std::make_unique<MemCache>(std::move(wrapped), options);
+}
+
+}  // namespace lczero
diff --git a/src/neural/memcache.h b/src/neural/memcache.h
new file mode 100644
index 0000000000..2750975841
--- /dev/null
+++ b/src/neural/memcache.h
@@ -0,0 +1,47 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "neural/backend.h"
+
+namespace lczero {
+
+class CachingBackend : public Backend {  // Backend plus cache controls.
+ public:
+  // Clears the cache.
+  virtual void ClearCache() = 0;
+  virtual void SetCacheSize(size_t size) = 0;  // Resizes the cache.
+};
+
+// Creates a caching backend wrapper, which returns values immediately if they
+// are found, and forwards the request to the wrapped backend otherwise (and
+// caches the result).
+std::unique_ptr<CachingBackend> CreateMemCache(std::unique_ptr<Backend> parent,
+                                               const OptionsDict& options);
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/neural/mock_backend.h b/src/neural/mock_backend.h
new file mode 100644
index 0000000000..78bbbdfee2
--- /dev/null
+++ b/src/neural/mock_backend.h
@@ -0,0 +1,65 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "gmock/gmock.h"
+
+#include "neural/backend.h"
+
+namespace lczero {
+
+class MockBackendComputation : public BackendComputation {  // gMock double.
+ public:
+  MOCK_METHOD(size_t, UsedBatchSize, (), (const, override));
+  MOCK_METHOD(AddInputResult, AddInput,
+              (const EvalPosition& pos, EvalResultPtr result), (override));
+  MOCK_METHOD(void, ComputeBlocking, (), (override));
+};
+
+class MockBackend : public Backend {  // gMock double for Backend.
+ public:
+  MOCK_METHOD(BackendAttributes, GetAttributes, (), (const, override));
+  MOCK_METHOD(std::unique_ptr<BackendComputation>, CreateComputation, (),
+              (override));
+  MOCK_METHOD(std::vector<EvalResult>, EvaluateBatch,
+              (std::span<const EvalPosition> positions), (override));
+  MOCK_METHOD(std::optional<EvalResult>, GetCachedEvaluation,
+              (const EvalPosition&), (override));
+  MOCK_METHOD(UpdateConfigurationResult, UpdateConfiguration,
+              (const OptionsDict&), (override));
+};  // NOTE(review): IsSameConfiguration() is not mocked -- confirm default.
+
+class MockBackendFactory : public BackendFactory {  // gMock double.
+ public:
+  MOCK_METHOD(int, GetPriority, (), (const, override));
+  MOCK_METHOD(std::string_view, GetName, (), (const, override));
+  MOCK_METHOD(std::unique_ptr<Backend>, Create, (const OptionsDict&),
+              (override));
+};
+
+}  // namespace lczero
diff --git a/src/neural/network.h b/src/neural/network.h
index b46e63a745..becf424427 100644
--- a/src/neural/network.h
+++ b/src/neural/network.h
@@ -121,6 +121,7 @@ class Network {
   virtual void InitThread(int /*id*/) {}
   virtual bool IsCpu() const { return false; }
   virtual int GetMiniBatchSize() const { return 256; }
+  virtual int GetPreferredBatchStep() const { return 1; }
   virtual ~Network() = default;
 };
 
diff --git a/src/neural/network_demux.cc b/src/neural/network_demux.cc
deleted file mode 100644
index a1a28f779f..0000000000
--- a/src/neural/network_demux.cc
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2018-2020 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-
-#include <condition_variable>
-#include <queue>
-#include <thread>
-
-#include "neural/factory.h"
-#include "utils/exception.h"
-
-namespace lczero {
-namespace {
-
-class DemuxingNetwork;
-class DemuxingComputation : public NetworkComputation {
- public:
-  DemuxingComputation(DemuxingNetwork* network) : network_(network) {}
-
-  void AddInput(InputPlanes&& input) override { planes_.emplace_back(input); }
-
-  void ComputeBlocking() override;
-
-  int GetBatchSize() const override { return planes_.size(); }
-
-  float GetQVal(int sample) const override {
-    const int idx = sample / partial_size_;
-    const int offset = sample % partial_size_;
-    return parents_[idx]->GetQVal(offset);
-  }
-
-  float GetDVal(int sample) const override {
-    int idx = sample / partial_size_;
-    int offset = sample % partial_size_;
-    return parents_[idx]->GetDVal(offset);
-  }
-
-  float GetMVal(int sample) const override {
-    int idx = sample / partial_size_;
-    int offset = sample % partial_size_;
-    return parents_[idx]->GetMVal(offset);
-  }
-
-  float GetPVal(int sample, int move_id) const override {
-    const int idx = sample / partial_size_;
-    const int offset = sample % partial_size_;
-    return parents_[idx]->GetPVal(offset, move_id);
-  }
-
-  void NotifyComplete() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    dataready_--;
-    if (dataready_ == 0) {
-      dataready_cv_.notify_one();
-    }
-  }
-
-  NetworkComputation* AddParentFromNetwork(Network* network) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    parents_.emplace_back(network->NewComputation());
-    const int cur_idx = (parents_.size() - 1) * partial_size_;
-    for (int i = cur_idx; i < std::min(GetBatchSize(), cur_idx + partial_size_);
-         i++) {
-      parents_.back()->AddInput(std::move(planes_[i]));
-    }
-    return parents_.back().get();
-  }
-
- private:
-  std::vector<InputPlanes> planes_;
-  DemuxingNetwork* network_;
-  std::vector<std::unique_ptr<NetworkComputation>> parents_;
-
-  std::mutex mutex_;
-  std::condition_variable dataready_cv_;
-  int dataready_ = 0;
-  int partial_size_ = 0;
-};
-
-class DemuxingNetwork : public Network {
- public:
-  DemuxingNetwork(const std::optional<WeightsFile>& weights,
-                  const OptionsDict& options) {
-    minimum_split_size_ = options.GetOrDefault<int>("minimum-split-size", 0);
-    const auto parents = options.ListSubdicts();
-    if (parents.empty()) {
-      // If options are empty, or multiplexer configured in root object,
-      // initialize on root object and default backend.
-      auto backends = NetworkFactory::Get()->GetBackendsList();
-      AddBackend(backends[0], weights, options);
-    }
-
-    for (const auto& name : parents) {
-      AddBackend(name, weights, options.GetSubdict(name));
-    }
-  }
-
-  void AddBackend(const std::string& name,
-                  const std::optional<WeightsFile>& weights,
-                  const OptionsDict& opts) {
-    const std::string backend = opts.GetOrDefault<std::string>("backend", name);
-
-    networks_.emplace_back(
-        NetworkFactory::Get()->Create(backend, weights, opts));
-
-    int nn_threads = opts.GetOrDefault<int>("threads", 0);
-    if (nn_threads == 0) {
-      nn_threads = networks_.back()->GetThreads();
-    }
-
-    min_batch_size_ =
-        std::min(min_batch_size_, networks_.back()->GetMiniBatchSize());
-    is_cpu_ &= networks_.back()->IsCpu();
-
-    if (networks_.size() == 1) {
-      capabilities_ = networks_.back()->GetCapabilities();
-    } else {
-      capabilities_.Merge(networks_.back()->GetCapabilities());
-    }
-
-    for (int i = 0; i < nn_threads; ++i) {
-      threads_.emplace_back([this]() { Worker(); });
-    }
-  }
-
-  std::unique_ptr<NetworkComputation> NewComputation() override {
-    return std::make_unique<DemuxingComputation>(this);
-  }
-
-  const NetworkCapabilities& GetCapabilities() const override {
-    return capabilities_;
-  }
-
-  int GetMiniBatchSize() const override {
-    return min_batch_size_ * threads_.size();
-  }
-
-  bool IsCpu() const override { return is_cpu_; }
-
-  void Enqueue(DemuxingComputation* computation) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    queue_.push(computation);
-    cv_.notify_one();
-  }
-
-  ~DemuxingNetwork() {
-    Abort();
-    Wait();
-    // Unstuck waiting computations.
-    while (!queue_.empty()) {
-      queue_.front()->NotifyComplete();
-      queue_.pop();
-    }
-  }
-
-  void Worker() {
-    // While Abort() is not called (and it can only be called from destructor).
-    while (!abort_) {
-      {
-        {
-          std::unique_lock<std::mutex> lock(mutex_);
-          // Wait until there's come work to compute.
-          cv_.wait(lock, [&] { return abort_ || !queue_.empty(); });
-          if (abort_) break;
-        }
-
-        // While there is a work in queue, process it.
-        while (true) {
-          DemuxingComputation* to_notify;
-          {
-            std::unique_lock<std::mutex> lock(mutex_);
-            if (queue_.empty()) break;
-            to_notify = queue_.front();
-            queue_.pop();
-          }
-          long long net_idx = ++(counter_) % networks_.size();
-          NetworkComputation* to_compute =
-              to_notify->AddParentFromNetwork(networks_[net_idx].get());
-          to_compute->ComputeBlocking();
-          to_notify->NotifyComplete();
-        }
-      }
-    }
-  }
-
-  void Abort() {
-    {
-      std::lock_guard<std::mutex> lock(mutex_);
-      abort_ = true;
-    }
-    cv_.notify_all();
-  }
-
-  void Wait() {
-    while (!threads_.empty()) {
-      threads_.back().join();
-      threads_.pop_back();
-    }
-  }
-
-  std::vector<std::unique_ptr<Network>> networks_;
-  NetworkCapabilities capabilities_;
-  int min_batch_size_ = std::numeric_limits<int>::max();
-  bool is_cpu_ = true;
-  std::queue<DemuxingComputation*> queue_;
-  int minimum_split_size_ = 0;
-  std::atomic<long long> counter_;
-  bool abort_ = false;
-
-  std::mutex mutex_;
-  std::condition_variable cv_;
-
-  std::vector<std::thread> threads_;
-};
-
-void DemuxingComputation::ComputeBlocking() {
-  if (GetBatchSize() == 0) return;
-  partial_size_ = (GetBatchSize() + network_->threads_.size() - 1) /
-                  network_->threads_.size();
-  if (partial_size_ < network_->minimum_split_size_) {
-    partial_size_ = std::min(GetBatchSize(), network_->minimum_split_size_);
-  }
-  const int splits = (GetBatchSize() + partial_size_ - 1) / partial_size_;
-
-  std::unique_lock<std::mutex> lock(mutex_);
-  dataready_ = splits;
-  for (int j = 0; j < splits; j++) {
-    network_->Enqueue(this);
-  }
-  dataready_cv_.wait(lock, [this]() { return dataready_ == 0; });
-}
-
-std::unique_ptr<Network> MakeDemuxingNetwork(
-    const std::optional<WeightsFile>& weights, const OptionsDict& options) {
-  return std::make_unique<DemuxingNetwork>(weights, options);
-}
-
-REGISTER_NETWORK("demux", MakeDemuxingNetwork, -1001)
-
-}  // namespace
-}  // namespace lczero
diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc
index 53846353c6..8c54b64973 100644
--- a/src/neural/network_legacy.cc
+++ b/src/neural/network_legacy.cc
@@ -142,7 +142,11 @@ BaseWeights::MHA::MHA(const pblczero::Weights::MHA& mha)
       dense_w(LayerAdapter(mha.dense_w()).as_vector()),
       dense_b(LayerAdapter(mha.dense_b()).as_vector()),
       smolgen(Smolgen(mha.smolgen())),
-      has_smolgen(mha.has_smolgen()) {}
+      has_smolgen(mha.has_smolgen()) {
+  if (mha.has_rpe_q() || mha.has_rpe_k() || mha.has_rpe_v()) {
+    throw Exception("RPE weights file not supported.");
+  }
+}
 
 BaseWeights::FFN::FFN(const pblczero::Weights::FFN& ffn)
     : dense1_w(LayerAdapter(ffn.dense1_w()).as_vector()),
diff --git a/src/neural/onnx/adapters.h b/src/neural/onnx/adapters.h
index e83a9385a7..fc04096d9f 100644
--- a/src/neural/onnx/adapters.h
+++ b/src/neural/onnx/adapters.h
@@ -30,8 +30,8 @@
 #include <initializer_list>
 
 #include "neural/onnx/builder.h"
-#include "neural/onnx/onnx.pb.h"
 #include "proto/net.pb.h"
+#include "proto/onnx.pb.h"
 #include "utils/weights_adapter.h"
 
 namespace lczero {
diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc
index fe09d5cb1c..94b7db650a 100644
--- a/src/neural/onnx/builder.cc
+++ b/src/neural/onnx/builder.cc
@@ -30,24 +30,30 @@
 #include <initializer_list>
 
 #include "neural/onnx/adapters.h"
-#include "neural/onnx/onnx.pb.h"
 #include "utils/exception.h"
-#include "utils/random.h"
 #include "version.h"
 
 namespace lczero {
 
-OnnxBuilder::OnnxBuilder(int opset) : opset_(opset) {
+OnnxBuilder::OnnxBuilder(int opset, int ir) : opset_(opset) {
   if (opset < 7 || opset > 22) {
     throw Exception("Only ONNX opsets between 7 and 22 are supported.");
   }
-  model_.set_ir_version(4);
+  // Keyed by the latest opset each IR version supports; value is that IR.
+  std::map<int, int> opset_to_ir = {{8, 3},  {9, 4},   {10, 5},
+                                    {11, 6}, {14, 7},  {18, 8},
+                                    {20, 9}, {22, 10}, {99, 11}};
+  if (ir < 0) ir = opset_to_ir.upper_bound(opset - 1)->second;
+  if (ir < 3 || ir > 10) {
+    throw Exception("Only ONNX IR between 3 and 10 is supported.");
+  }
+  model_.set_ir_version(ir);
   model_.set_domain("org.lczero.models.*");
   model_.set_producer_name("Lc0");
   model_.set_producer_version(GetVersionStr());
   model_.add_opset_import()->set_version(opset);
-  model_.mutable_graph()->set_name("org.lczero/converted/" +
-                                   Random::Get().GetString(16));
+  // TODO change to real network name when it becomes available.
+  model_.mutable_graph()->set_name("org.lczero/converted");
 }
 
 namespace {
diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h
index 4ada3c37f7..7fa7323306 100644
--- a/src/neural/onnx/builder.h
+++ b/src/neural/onnx/builder.h
@@ -30,7 +30,7 @@
 #include <initializer_list>
 #include <string>
 
-#include "neural/onnx/onnx.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
@@ -45,7 +45,7 @@ class OnnxConst {
 // Builds Onnx::ModelProto.
 class OnnxBuilder {
  public:
-  OnnxBuilder(int opset);
+  OnnxBuilder(int opset, int ir = -1);
   void AddInput(const std::string& name, std::initializer_list<int> dims,
                 pblczero::TensorProto::DataType datatype);
   void AddOutput(const std::string& name, std::initializer_list<int> dims,
diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc
index 605e77fbd5..c59069842a 100644
--- a/src/neural/onnx/converter.cc
+++ b/src/neural/onnx/converter.cc
@@ -38,9 +38,9 @@
 #include "neural/network_legacy.h"
 #include "neural/onnx/adapters.h"
 #include "neural/onnx/builder.h"
-#include "neural/shared/activation.h"
-#include "neural/shared/attention_policy_map.h"
-#include "neural/shared/policy_map.h"
+#include "neural/tables/activation_function.h"
+#include "neural/tables/attention_policy_map.h"
+#include "neural/tables/policy_map.h"
 #include "proto/net.pb.h"
 #include "utils/bf16_utils.h"
 #include "utils/exception.h"
@@ -250,11 +250,10 @@ std::string Converter::EndOptionalBf16Fix(OnnxBuilder* builder,
 
 std::string Converter::MakeMish(OnnxBuilder* builder, const std::string& input,
                                 const std::string& name) {
-  if (!options_.alt_mish || options_.opset < 9 ||
-      options_.data_type != WeightsToOnnxConverterOptions::DataType::kFloat32) {
+  if (!options_.alt_mish) {
     std::string flow = input;
     flow = StartOptionalBf16Fix(builder, flow, name);
-    if (options_.opset >= 18) {
+    if (options_.opset >= 18 && options_.real_mish) {
       flow = builder->Mish(name, flow);
       return EndOptionalBf16Fix(builder, flow, name);
     }
@@ -263,20 +262,16 @@ std::string Converter::MakeMish(OnnxBuilder* builder, const std::string& input,
     flow = builder->Tanh(name + "/tanh", flow);
     return builder->Mul(name, flow, input);
   } else {
-    const OnnxConst& two =
-        static_cast<const OnnxConst&>(FloatOnnxConst({2.0f}, {1}));
-    const OnnxConst& zero =
-        static_cast<const OnnxConst&>(FloatOnnxConst({0.0f}, {1}));
-    auto e = builder->Exp(name + "/exp", input);
+    auto in = input;
+    auto one = builder->AddInitializer(name + "/one", *GetScalarConverter(1));
+    auto two = builder->AddInitializer(name + "/two", *GetScalarConverter(2));
+    auto e = builder->Exp(name + "/e", in);
     auto flow = builder->Add(name + "/e+2", e, two);
-    auto n = builder->Mul(name + "/n", e, flow);
-    flow = builder->Add(name + "/n+2", n, two);
-    auto d = builder->Div(name + "/d", input, flow);
-    auto f = builder->Mul(name + "/n*d", n, d);
-    flow = builder->Mul(name + "/2*d", d, two);
-    auto t = builder->Sub(name + "/in-2*d", input, flow);
-    flow = builder->Greater(name + "/compare", input, zero);
-    return builder->Where(name, flow, t, f);
+    flow = builder->Mul(name + "/e*e+2e", e, flow);
+    flow = builder->Div(name + "/2/(e*e+2e)", two, flow);
+    flow = builder->Add(name + "/1+2/(e*e+2e)", flow, one);
+    flow = builder->Div(name + "/in/(1+2/(e*e+2e))", in, flow);
+    return flow;
   }
 }
 
@@ -467,8 +462,10 @@ std::string Converter::MakeLayerNorm(OnnxBuilder* builder,
   if (!options_.alt_layernorm) {
     return builder->LayerNormalization(name, input, gammas, betas, 1, eps);
   }
-  auto in =
-      builder->Cast(name + "/to_float", input, pblczero::TensorProto::FLOAT);
+  auto in = input;
+  if (GetDataType() != pblczero::TensorProto::FLOAT) {
+    in = builder->Cast(name + "/to_float", in, pblczero::TensorProto::FLOAT);
+  }
   auto flow = builder->ReduceMean(name + "/mean", in, {1});
   in = builder->Sub(name + "/centered", in, flow);
   flow = builder->Mul(name + "/squared", in, in);
@@ -479,7 +476,9 @@ std::string Converter::MakeLayerNorm(OnnxBuilder* builder,
   flow = builder->Sqrt(name + "/std", flow);
   flow = builder->Reciprocal(name + "/inv_std", flow);
   flow = builder->Mul(name + "/normalized", in, flow);
-  flow = builder->Cast(name + "/to_data_type", flow, GetDataType());
+  if (GetDataType() != pblczero::TensorProto::FLOAT) {
+    flow = builder->Cast(name + "/to_data_type", flow, GetDataType());
+  }
   flow = builder->Mul(name + "/gammas", flow, gammas);
   flow = builder->Add(name + "/betas", flow, betas);
   return flow;
@@ -744,7 +743,7 @@ std::string Converter::MakeAttentionBody(OnnxBuilder* builder,
 
   if (weights.ip_mult_gate.size() > 0 || weights.ip_add_gate.size() > 0) {
     flow = builder->Reshape(
-        "/attn_body/ma_gating/rehape1", flow,
+        "/attn_body/ma_gating/rehape", flow,
         builder->AddInitializer("/const/ma_gating/shape1",
                                 Int64OnnxConst({-1, 64, embedding_size}, {3})));
     if (weights.ip_mult_gate.size() > 0) {
@@ -757,17 +756,23 @@ std::string Converter::MakeAttentionBody(OnnxBuilder* builder,
                           *GetWeghtsConverter(weights.ip_add_gate,
                                               {64, embedding_size}, {1, 0}));
     }
-    flow = builder->Reshape(
-        "/attn_body/ma_gating/rehape2", flow,
-        builder->AddInitializer("/const/ma_gating/shape2",
-                                Int64OnnxConst({-1, embedding_size}, {2})));
   }
 
+  flow = builder->Reshape(
+      "/attn_body/rehape", flow,
+      builder->AddInitializer("/const/ma_gating/shape2",
+                              Int64OnnxConst({-1, embedding_size}, {2})));
+
   float alpha = std::pow(2.0f * NumEncBlocks(), -0.25f);
 
   if (input_embedding == network_format::INPUT_EMBEDDING_PE_DENSE) {
-    flow = MakeFFN(builder, weights.ip_emb_ffn, embedding_size, flow,
-                   "/attn_body", default_activation_, alpha);
+    const auto ffn_activation = static_cast<ActivationFunction>(
+        src_.format().network_format().ffn_activation());
+    flow =
+        MakeFFN(builder, weights.ip_emb_ffn, embedding_size, flow, "/attn_body",
+                ffn_activation == ACTIVATION_DEFAULT ? default_activation_
+                                                     : ffn_activation,
+                alpha);
     flow = MakeLayerNorm(
         builder, flow, "/attn_body/ln2",
         *GetWeghtsConverter(weights.ip_emb_ffn_ln_gammas, {embedding_size}),
@@ -907,7 +912,7 @@ void Converter::MakePolicyHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
                                const std::string& input,
                                const MultiHeadWeights& weights) {
   // Check that selected policy head exists.
-  if (weights.policy_heads.count(options_.policy_head) == 0) {
+  if (!weights.policy_heads.contains(options_.policy_head)) {
     throw Exception("The policy head you specified '" + options_.policy_head +
                     "'" + " does not exist in this net.");
   }
@@ -975,7 +980,7 @@ void Converter::MakeValueHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
                               const std::string& input,
                               const MultiHeadWeights& weights) {
   // Check that selected value head exists.
-  if (weights.value_heads.count(options_.value_head) == 0) {
+  if (!weights.value_heads.contains(options_.value_head)) {
     throw Exception("The value head you specified '" + options_.value_head +
                     "'" + " does not exist in this net.");
   }
@@ -1019,9 +1024,11 @@ void Converter::MakeValueHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
                         *GetWeghtsConverter(head.ip2_val_w, {128, 3}, {1, 0}));
     flow = builder->Add("/value/dense2/add", flow,
                         *GetWeghtsConverter(head.ip2_val_b, {3}));
-    auto output = builder->Softmax(options_.output_wdl, flow);
-    builder->AddOutput(output, {options_.batch_size, 3}, GetDataType());
-    onnx->set_output_wdl(output);
+    if (!options_.no_wdl_softmax) {
+      flow = builder->Softmax(options_.output_wdl, flow);
+    }
+    builder->AddOutput(flow, {options_.batch_size, 3}, GetDataType());
+    onnx->set_output_wdl(flow);
   } else {
     flow =
         builder->MatMul("/value/dense2/matmul", flow,
@@ -1078,15 +1085,15 @@ void Converter::MakeMovesLeftHead(pblczero::OnnxModel* onnx,
       *GetWeghtsConverter(weights.ip2_mov_w, {mlh_fc1_outputs, 1}, {1, 0}));
   flow = builder->Add("/mlh/dense2/add", flow,
                       *GetWeghtsConverter(weights.ip2_mov_b, {1}));
-  flow = MakeActivation(builder, flow, "/mlh/dense2", default_activation_);
-  auto output = builder->Identity(options_.output_mlh, flow);
+  // Explicit ReLU activation.
+  auto output = builder->Relu(options_.output_mlh, flow);
   builder->AddOutput(output, {options_.batch_size, 1}, GetDataType());
   onnx->set_output_mlh(output);
 }
 
 void Converter::GenerateOnnx(pblczero::OnnxModel* onnx) {
   MultiHeadWeights weights(src_.weights());
-  OnnxBuilder builder(options_.opset);
+  OnnxBuilder builder(options_.opset, options_.ir);
 
   if (GetDataType() == pblczero::TensorProto::FLOAT16) {
     onnx->set_data_type(pblczero::OnnxModel::FLOAT16);
diff --git a/src/neural/onnx/converter.h b/src/neural/onnx/converter.h
index 632f65c94b..e6c768aad9 100644
--- a/src/neural/onnx/converter.h
+++ b/src/neural/onnx/converter.h
@@ -29,8 +29,8 @@
 
 #include <string>
 
-#include "neural/onnx/onnx.pb.h"
 #include "proto/net.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
@@ -45,9 +45,12 @@ struct WeightsToOnnxConverterOptions {
   std::string output_mlh = "/output/mlh";
   int batch_size = -1;
   int opset = 17;
-  bool alt_mish = false;       // Use "Mish" approximation (fp32 only).
+  int ir = -1;                 // ONNX IR, -1 for auto.
+  bool alt_mish = false;       // Use "Mish" approximation.
+  bool real_mish = true;       // Use "Mish" operator (opset 18+ and !alt_mish).
   bool alt_layernorm = false;  // Discrete "LayerNormalization" implementation.
   bool no_shape = false;       // Avoid use of "Shape" operator.
+  bool no_wdl_softmax = false; // Skip wdl softmax.
   std::string policy_head = "vanilla";
   std::string value_head = "winner";
 
diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc
deleted file mode 100644
index fe7d1a436e..0000000000
--- a/src/neural/onnx/network_onnx.cc
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2021-2023 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-
-#include <algorithm>
-#include <cassert>
-#include <fstream>
-#include <iterator>
-#include <memory>
-#include <string>
-#include <vector>
-
-#if __has_include("dml_provider_factory.h")
-#include "dml_provider_factory.h"
-#define USE_DML
-#endif
-
-#include "cpu_provider_factory.h"
-#include "neural/factory.h"
-#include "neural/loader.h"
-#include "neural/network.h"
-#include "neural/onnx/converter.h"
-#include "onnxruntime_cxx_api.h"
-#include "utils/bf16_utils.h"
-#include "utils/bititer.h"
-#include "utils/exception.h"
-#include "utils/fp16_utils.h"
-#include "utils/logging.h"
-
-namespace lczero {
-namespace {
-
-enum class OnnxProvider { CPU, CUDA, DML, ROCM };
-
-class OnnxNetwork;
-
-template <typename DataType>
-class OnnxComputation : public NetworkComputation {
- public:
-  OnnxComputation(OnnxNetwork* network);
-  void AddInput(InputPlanes&& input) override;
-  int GetBatchSize() const override { return raw_input_.size(); }
-  void ComputeBlocking() override;
-  float GetQVal(int sample) const override;
-  float GetDVal(int sample) const override;
-  float GetPVal(int sample, int move_id) const override;
-  float GetMVal(int sample) const override;
-
- private:
-  Ort::Value PrepareInputs(int start, int batch_size);
-
-  OnnxNetwork* network_;
-  std::vector<InputPlanes> raw_input_;
-  std::vector<DataType> input_tensor_data_;
-  std::vector<Ort::Value> output_tensors_;
-  std::vector<std::vector<DataType>> output_tensors_data_;
-  std::vector<size_t> output_tensors_step_;
-};
-
-class OnnxNetwork : public Network {
- public:
-  OnnxNetwork(const WeightsFile& file, const OptionsDict& options,
-              OnnxProvider provider, int gpu, int threads, int batch_size,
-              int steps);
-  std::unique_ptr<NetworkComputation> NewComputation() override {
-    if (fp16_) {
-      return std::make_unique<OnnxComputation<Ort::Float16_t>>(this);
-    } else if (bf16_) {
-      return std::make_unique<OnnxComputation<Ort::BFloat16_t>>(this);
-    } else {
-      return std::make_unique<OnnxComputation<float>>(this);
-    }
-  }
-  const NetworkCapabilities& GetCapabilities() const override {
-    return capabilities_;
-  }
-  int GetMiniBatchSize() const override {
-    return batch_size_ == -1 ? Network::GetMiniBatchSize()
-                             : batch_size_ * steps_;
-  }
-  bool IsCpu() const override { return provider_ == OnnxProvider::CPU; }
-
-  Ort::Env onnx_env_;
-  // Prepare sessions for this many multiples of the batch size;
-  int steps_;
-  std::vector<Ort::Session> session_;
-  std::vector<std::string> inputs_;
-  // Points to strings in inputs_.
-  std::vector<const char*> inputs_cstr_;
-  std::vector<std::string> outputs_;
-  // Points to strings in outputs_.
-  std::vector<const char*> outputs_cstr_;
-  // Indices in output_cstr_ vector.
-  int policy_head_ = -1;
-  int wdl_head_ = -1;
-  int value_head_ = -1;
-  int mlh_head_ = -1;
-  NetworkCapabilities capabilities_;
-  bool fp16_;
-  bool bf16_;
-  // The batch size to use, or -1 for variable.
-  int batch_size_;
-  static constexpr int max_batch_size_ = 1024;
-  // For conditional locking if running the DML provider.
-  OnnxProvider provider_;
-  std::mutex lock_;
-};
-
-template <typename DataType>
-OnnxComputation<DataType>::OnnxComputation(OnnxNetwork* network)
-    : network_(network) {
-  output_tensors_data_.resize(network_->outputs_.size());
-  output_tensors_step_.resize(network_->outputs_.size());
-  output_tensors_step_[network_->policy_head_] = 1858;
-  output_tensors_data_[network_->policy_head_] =
-      std::vector<DataType>(1858 * network_->max_batch_size_);
-  if (network_->wdl_head_ != -1) {
-    output_tensors_step_[network_->wdl_head_] = 3;
-    output_tensors_data_[network_->wdl_head_] =
-        std::vector<DataType>(3 * network_->max_batch_size_);
-  }
-  if (network_->value_head_ != -1) {
-    output_tensors_step_[network_->value_head_] = 1;
-    output_tensors_data_[network_->value_head_] =
-        std::vector<DataType>(network_->max_batch_size_);
-  }
-  if (network_->mlh_head_ != -1) {
-    output_tensors_step_[network_->mlh_head_] = 1;
-    output_tensors_data_[network_->mlh_head_] =
-        std::vector<DataType>(network_->max_batch_size_);
-  }
-}
-
-template <typename DataType>
-void OnnxComputation<DataType>::AddInput(InputPlanes&& input) {
-  raw_input_.emplace_back(input);
-  if (raw_input_.size() > network_->max_batch_size_) {
-    throw Exception("NN input exceeds max batch size of " +
-                    std::to_string(network_->max_batch_size_) + ".");
-  }
-}
-
-float AsFloat(float x) { return x; }
-float AsFloat(Ort::Float16_t x) {
-  uint16_t tmp;
-  std::memcpy(&tmp, reinterpret_cast<uint16_t*>(&x), sizeof(uint16_t));
-  return FP16toFP32(tmp);
-}
-float AsFloat(Ort::BFloat16_t x) {
-  uint16_t tmp;
-  std::memcpy(&tmp, reinterpret_cast<uint16_t*>(&x), sizeof(uint16_t));
-  return BF16toFP32(tmp);
-}
-
-template <typename DataType>
-float OnnxComputation<DataType>::GetQVal(int sample) const {
-  if (network_->wdl_head_ != -1) {
-    const auto& data = output_tensors_data_[network_->wdl_head_];
-    return AsFloat(data[sample * 3 + 0]) - AsFloat(data[sample * 3 + 2]);
-  } else {
-    const auto& data = output_tensors_data_[network_->value_head_];
-    return AsFloat(data[sample]);
-  }
-}
-
-template <typename DataType>
-float OnnxComputation<DataType>::GetDVal(int sample) const {
-  if (network_->wdl_head_ == -1) return 0.0f;
-  const auto& data = output_tensors_data_[network_->wdl_head_];
-  return AsFloat(data[sample * 3 + 1]);
-}
-
-template <typename DataType>
-float OnnxComputation<DataType>::GetPVal(int sample, int move_id) const {
-  const auto& data = output_tensors_data_[network_->policy_head_];
-  return AsFloat(data[sample * 1858 + move_id]);
-}
-
-template <typename DataType>
-float OnnxComputation<DataType>::GetMVal(int sample) const {
-  if (network_->mlh_head_ == -1) return 0.0f;
-  const auto& data = output_tensors_data_[network_->mlh_head_];
-  return AsFloat(data[sample]);
-}
-
-void AsDataType(float x, float* y) { *y = x; }
-void AsDataType(float x, Ort::Float16_t* y) {
-  uint16_t tmp = FP32toFP16(x);
-  std::memcpy(reinterpret_cast<uint16_t*>(y), &tmp, sizeof(uint16_t));
-}
-void AsDataType(float x, Ort::BFloat16_t* y) {
-  uint16_t tmp = FP32toBF16(x);
-  std::memcpy(reinterpret_cast<uint16_t*>(y), &tmp, sizeof(uint16_t));
-}
-
-template <typename DataType>
-Ort::Value OnnxComputation<DataType>::PrepareInputs(int start, int batch_size) {
-  input_tensor_data_.clear();
-  input_tensor_data_.resize(batch_size * kInputPlanes * 8 * 8);
-  auto iter = input_tensor_data_.data();
-  int end = std::min(start + batch_size, static_cast<int>(raw_input_.size()));
-  for (int i = start; i < end; i++) {
-    for (const auto& plane : raw_input_[i]) {
-      DataType value;
-      AsDataType(plane.value, &value);
-      for (auto bit : IterateBits(plane.mask)) {
-        *(iter + bit) = value;
-      }
-      iter += 64;
-    }
-  }
-  for (int i = end; i < start + batch_size; i++) {
-    for (int j = 0; j < kInputPlanes * 64; j++) {
-      *iter++ = DataType();
-    }
-  }
-
-  auto memory_info =
-      Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
-
-  output_tensors_.clear();
-  for (size_t i = 0; i < output_tensors_step_.size(); i++) {
-    int size = output_tensors_step_[i];
-    int64_t dims[] = {batch_size, size};
-    output_tensors_.emplace_back(Ort::Value::CreateTensor<DataType>(
-        memory_info, output_tensors_data_[i].data() + start * size,
-        size * batch_size, dims, 2));
-  }
-
-  int64_t dims[] = {batch_size, kInputPlanes, 8, 8};
-  return Ort::Value::CreateTensor<DataType>(memory_info,
-                                            input_tensor_data_.data(),
-                                            input_tensor_data_.size(), dims, 4);
-}
-
-template <typename DataType>
-void OnnxComputation<DataType>::ComputeBlocking() {
-  int batch_size = network_->batch_size_;
-  if (batch_size < 0) batch_size = raw_input_.size();
-
-  for (size_t i = 0; i < raw_input_.size();) {
-    int step = (raw_input_.size() - i + batch_size - 1) / batch_size;
-    if (step > network_->steps_) step = network_->steps_;
-    int batch = batch_size * step;
-
-    auto input_tensor = PrepareInputs(i, batch);
-    // The DML onnxruntime execution provider is documented as not supporting
-    // multi-threaded calls to Run on the same inference session. We found the
-    // same to be true for the ROCm execution provider (at least for CNNs).
-    // TODO: This may be a onnxruntime/ROCm bug, check onnxruntime 1.16 release.
-    if (network_->provider_ == OnnxProvider::DML ||
-        network_->provider_ == OnnxProvider::ROCM) {
-      network_->lock_.lock();
-    }
-    network_->session_[step - 1].Run(
-        {}, network_->inputs_cstr_.data(), &input_tensor, 1,
-        network_->outputs_cstr_.data(), output_tensors_.data(),
-        output_tensors_.size());
-    if (network_->provider_ == OnnxProvider::DML ||
-        network_->provider_ == OnnxProvider::ROCM) {
-      network_->lock_.unlock();
-    }
-    i += batch;
-  }
-}
-
-Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads,
-                               int batch_size) {
-  Ort::SessionOptions options;
-  options.SetIntraOpNumThreads(threads);
-  options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
-
-  if (batch_size > 0) {
-    // Override the default (variable) batch size.
-    Ort::ThrowOnError(
-        OrtGetApiBase()
-            ->GetApi(ORT_API_VERSION)
-            ->AddFreeDimensionOverrideByName(options, "batch", batch_size));
-  }
-
-  switch (provider) {
-    case OnnxProvider::DML:
-      options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
-      options.DisableMemPattern();
-#ifdef USE_DML
-      Ort::ThrowOnError(
-          OrtSessionOptionsAppendExecutionProvider_DML(options, gpu));
-#else
-      throw Exception("ONNX backend internal error.");
-#endif
-      break;
-    case OnnxProvider::ROCM: {
-      OrtROCMProviderOptions rocm_options;
-      rocm_options.device_id = gpu;
-      options.AppendExecutionProvider_ROCM(rocm_options);
-      break;
-    }
-    case OnnxProvider::CUDA: {
-      OrtCUDAProviderOptions cuda_options;
-      cuda_options.device_id = gpu;
-      options.AppendExecutionProvider_CUDA(cuda_options);
-      break;
-    }
-    case OnnxProvider::CPU:
-      auto status = OrtSessionOptionsAppendExecutionProvider_CPU(options, 0);
-      if (status) {
-        std::string error_message = Ort::GetApi().GetErrorMessage(status);
-        OrtErrorCode error_code = Ort::GetApi().GetErrorCode(status);
-        Ort::GetApi().ReleaseStatus(status);
-        throw Exception("ONNX CPU error " + std::to_string(error_code) + ": " +
-                        error_message);
-      }
-      break;
-  }
-  return options;
-}
-
-OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict&,
-                         OnnxProvider provider, int gpu, int threads,
-                         int batch_size, int steps)
-    : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"),
-      steps_(steps),
-      capabilities_{file.format().network_format().input(),
-                    file.format().network_format().output(),
-                    file.format().network_format().moves_left()},
-      fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16),
-      bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16),
-      batch_size_(batch_size),
-      provider_(provider) {
-  // Sanity checks.
-  if (batch_size_ < 0) steps_ = 1;
-  if (batch_size_ * steps > max_batch_size_) {
-    batch_size_ = max_batch_size_ / steps_;
-  }
-
-  for (int step = 1; step <= steps_; step++)
-    session_.emplace_back(
-        onnx_env_, file.onnx_model().model().data(),
-        file.onnx_model().model().size(),
-        GetOptions(provider, gpu, threads, batch_size_ * step));
-
-  const auto& md = file.onnx_model();
-  if (!md.has_input_planes()) {
-    throw Exception("NN doesn't have input planes defined.");
-  }
-  inputs_.emplace_back(md.input_planes());
-  if (!md.has_output_policy()) {
-    throw Exception("NN doesn't have policy head defined.");
-  }
-  policy_head_ = outputs_.size();
-  outputs_.emplace_back(md.output_policy());
-  if (md.has_output_wdl()) {
-    wdl_head_ = outputs_.size();
-    outputs_.emplace_back(md.output_wdl());
-  } else if (md.has_output_value()) {
-    value_head_ = outputs_.size();
-    outputs_.emplace_back(md.output_value());
-  } else {
-    throw Exception("NN doesn't have value head.");
-  }
-  if (md.has_output_mlh()) {
-    mlh_head_ = outputs_.size();
-    outputs_.emplace_back(md.output_mlh());
-  }
-  std::transform(inputs_.begin(), inputs_.end(),
-                 std::back_inserter(inputs_cstr_),
-                 [](const auto& x) { return x.c_str(); });
-  std::transform(outputs_.begin(), outputs_.end(),
-                 std::back_inserter(outputs_cstr_),
-                 [](const auto& x) { return x.c_str(); });
-}
-
-template <OnnxProvider kProvider>
-std::unique_ptr<Network> MakeOnnxNetwork(const std::optional<WeightsFile>& w,
-                                         const OptionsDict& opts) {
-  if (!w) throw Exception("The ONNX backend requires a network file.");
-
-  int gpu = opts.GetOrDefault<int>("gpu", 0);
-
-  int batch_size =
-      opts.GetOrDefault<int>("batch", kProvider == OnnxProvider::DML ? 16 : -1);
-
-  int steps =
-      opts.GetOrDefault<int>("steps", kProvider == OnnxProvider::DML ? 4 : 1);
-
-  int threads =
-      opts.GetOrDefault<int>("threads", kProvider == OnnxProvider::CPU ? 1 : 0);
-
-  if (batch_size <= 0) batch_size = -1;  // Variable batch size.
-
-  if (w->has_onnx_model()) {
-    return std::make_unique<OnnxNetwork>(*w, opts, kProvider, gpu, threads,
-                                         batch_size, steps);
-  } else {
-    WeightsToOnnxConverterOptions converter_options;
-    converter_options.opset = opts.GetOrDefault<int>("opset", 17);
-    converter_options.alt_mish = opts.GetOrDefault<bool>(
-        "alt_mish", kProvider == OnnxProvider::CPU ? true : false);
-    converter_options.alt_layernorm = opts.GetOrDefault<bool>(
-        "alt_layernorm", kProvider == OnnxProvider::DML ? true : false);
-    converter_options.no_shape = opts.GetOrDefault<bool>("no_shape", false);
-    converter_options.policy_head =
-        opts.GetOrDefault<std::string>("policy_head", "vanilla");
-    converter_options.value_head =
-        opts.GetOrDefault<std::string>("value_head", "winner");
-
-    std::string datatype;
-    if (opts.IsDefault<std::string>("datatype")) {
-      bool fp16 = opts.GetOrDefault<bool>(
-          "fp16", kProvider == OnnxProvider::CPU ? false : true);
-      datatype = fp16 ? "f16" : "f32";
-    } else {
-      datatype = opts.Get<std::string>("datatype");
-    }
-    converter_options.data_type =
-        WeightsToOnnxConverterOptions::StringToDataType(datatype);
-
-    auto converted = ConvertWeightsToOnnx(*w, converter_options);
-    return std::make_unique<OnnxNetwork>(converted, opts, kProvider, gpu,
-                                         threads, batch_size, steps);
-  }
-}
-
-#ifdef USE_ROCM
-REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork<OnnxProvider::ROCM>, 64)
-#endif
-#ifdef USE_DML
-REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork<OnnxProvider::DML>, 63)
-#endif
-REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork<OnnxProvider::CUDA>, 61)
-REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork<OnnxProvider::CPU>, 62)
-
-}  // namespace
-}  // namespace lczero
diff --git a/src/neural/register.cc b/src/neural/register.cc
new file mode 100644
index 0000000000..2a61e99352
--- /dev/null
+++ b/src/neural/register.cc
@@ -0,0 +1,99 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/register.h"
+
+#include <algorithm>
+
+#include "default_backend.h"
+#include "neural/shared_params.h"
+
+namespace lczero {
+
+BackendManager* BackendManager::Get() {
+  static BackendManager instance;  // Function-local: built on first call.
+  return &instance;
+}
+
+std::vector<std::string> BackendManager::GetBackendNames() const {
+  std::vector<std::pair<int, std::string>> priority_and_names;
+  std::transform(algorithms_.begin(), algorithms_.end(),
+                 std::back_inserter(priority_and_names),
+                 [](const std::unique_ptr<BackendFactory>& factory) {
+                   return std::make_pair(factory->GetPriority(),
+                                         std::string(factory->GetName()));
+                 });
+  std::sort(priority_and_names.begin(), priority_and_names.end(),
+            std::greater<>());  // Descending by priority, then by name.
+  std::vector<std::string> result;
+  std::transform(priority_and_names.begin(), priority_and_names.end(),
+                 std::back_inserter(result),
+                 [](const std::pair<int, std::string>& p) { return p.second; });
+#ifdef DEFAULT_BACKEND  // A compile-time default overrides the priorities.
+  std::string name = DEFAULT_BACKEND;
+  auto pos = std::find(result.begin(), result.end(), name);
+  if (pos == result.end()) throw Exception("Unknown backend: " + name);
+  std::rotate(result.begin(), pos, pos + 1);  // Move it to the front.
+#endif
+  return result;
+}
+
+BackendFactory* BackendManager::GetFactoryByName(std::string_view name) const {
+  auto iter =
+      std::find_if(algorithms_.begin(), algorithms_.end(),
+                   [name](const std::unique_ptr<BackendFactory>& factory) {
+                     return factory->GetName() == name;
+                   });
+  return iter == algorithms_.end() ? nullptr : iter->get();  // Non-owning.
+}
+
+std::unique_ptr<Backend> BackendManager::CreateFromParams(
+    const OptionsDict& options) const {
+  const std::string backend =  // Name read from the kBackendId option.
+      options.Get<std::string>(SharedBackendParams::kBackendId);
+  return CreateFromName(backend, options);
+}
+
+std::unique_ptr<Backend> BackendManager::CreateFromName(
+    std::string_view name, const OptionsDict& options) const {
+  BackendFactory* factory = GetFactoryByName(name);  // nullptr if unknown.
+  if (!factory) throw Exception("Unknown backend: " + std::string(name));
+  return factory->Create(options);
+}
+
+void BackendManager::RemoveBackend(const BackendFactory* factory) {
+  auto iter = std::find_if(algorithms_.begin(), algorithms_.end(),
+                           [factory](const std::unique_ptr<BackendFactory>& f) {
+                             return f.get() == factory;
+                           });
+  if (iter == algorithms_.end()) {
+    throw Exception("Attempt to remove unregistered backend");
+  }
+  algorithms_.erase(iter);  // Also destroys the factory (owning unique_ptr).
+}
+
+}  // namespace lczero
diff --git a/src/neural/register.h b/src/neural/register.h
new file mode 100644
index 0000000000..db30ef2a52
--- /dev/null
+++ b/src/neural/register.h
@@ -0,0 +1,79 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include <memory>
+#include <vector>
+
+#include "neural/backend.h"
+
+#pragma once
+
+namespace lczero {
+
+class BackendManager {  // Registry of the available backend factories.
+ public:
+  static BackendManager* Get();  // Singleton accessor.
+  void AddBackend(std::unique_ptr<BackendFactory> factory) {
+    algorithms_.push_back(std::move(factory));
+  }
+  // Removes the backend factory from the list. Currently only used in tests.
+  void RemoveBackend(const BackendFactory* factory);
+
+  // Returns list of backend names, sorted by priority (higher priority first).
+  std::vector<std::string> GetBackendNames() const;
+
+  // Creates a backend from the parameters. Extracts the weights file and the
+  // backend from the options.
+  std::unique_ptr<Backend> CreateFromParams(const OptionsDict& options) const;
+
+  // Creates a backend from the name. Backend name from the options is ignored.
+  // Note that unlike the WeightsFactory, the "options" parameter contains
+  // top-level parameters rather than `backend-opts`.
+  std::unique_ptr<Backend> CreateFromName(std::string_view name,
+                                          const OptionsDict& options) const;
+
+  // Returns a backend factory by name. Returns nullptr if not found.
+  BackendFactory* GetFactoryByName(std::string_view name) const;
+
+  struct Register {  // Its constructor adds `factory` to the manager.
+    Register(std::unique_ptr<BackendFactory> factory) {
+      BackendManager::Get()->AddBackend(std::move(factory));
+    }
+  };
+
+ private:
+  BackendManager() = default;  // Private: obtain the instance via Get().
+
+  std::vector<std::unique_ptr<BackendFactory>> algorithms_;  // Owned.
+};
+
+#define REGISTER_BACKEND(factory)                                     \
+  namespace {                                                         \
+  [[maybe_unused]] static BackendManager::Register reg29c93##factory( \
+      std::make_unique<factory>());                                   \
+  }  // Registers `factory` with BackendManager during static init.
+}  // namespace lczero
diff --git a/src/neural/shared_params.cc b/src/neural/shared_params.cc
new file mode 100644
index 0000000000..ba5d460f34
--- /dev/null
+++ b/src/neural/shared_params.cc
@@ -0,0 +1,86 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/shared_params.h"
+
+#include "neural/register.h"
+
+namespace lczero {
+const OptionId SharedBackendParams::kPolicySoftmaxTemp{
+    "policy-softmax-temp", "PolicyTemperature",
+    "Policy softmax temperature. Higher values make priors of move candidates "
+    "closer to each other, widening the search."};
+const OptionId SharedBackendParams::kHistoryFill{
+    "history-fill-new", "HistoryFill",
+    "Neural network uses 7 previous board positions in addition to the current "
+    "one. During the first moves of the game such historical positions don't "
+    "exist, but they can be synthesized. This parameter defines when to "
+    "synthesize them (always, never, or only at non-standard fen position)."};
+const OptionId SharedBackendParams::kWeightsId{
+    {.long_flag = "weights",
+     .uci_option = "WeightsFile",
+     .help_text =
+         "Path from which to load network weights.\nSetting it to "
+         "<autodiscover> makes it search in ./ and ./weights/ subdirectories "
+         "for the latest (by file date) file which looks like weights.",
+     .short_flag = 'w',
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId SharedBackendParams::kBackendId{{
+    .long_flag = "backend",
+    .uci_option = "Backend",
+    .help_text = "Neural network computational backend to use.",
+    .short_flag = 'b',
+}};
+const OptionId SharedBackendParams::kBackendOptionsId{
+    "backend-opts", "BackendOptions",
+    "Parameters of neural network backend. Exact parameters differ per "
+    "backend.",
+    'o'};
+const OptionId SharedBackendParams::kNNCacheSizeId{
+    "nncache", "NNCacheSize",
+    "Number of positions to store in a memory cache. A large cache can speed "
+    "up searching, but takes memory."};
+
+void SharedBackendParams::Populate(OptionsParser* options) {
+  options->Add<FloatOption>(kPolicySoftmaxTemp, 0.1f, 10.0f) = 1.359f;
+  std::vector<std::string> history_fill_opt{"no", "fen_only", "always"};
+  options->Add<ChoiceOption>(kHistoryFill, history_fill_opt) = "fen_only";
+
+#if defined(EMBED)  // The default weights source depends on this flag.
+  options->Add<StringOption>(SharedBackendParams::kWeightsId) = kEmbed;
+#else
+  options->Add<StringOption>(SharedBackendParams::kWeightsId) = kAutoDiscover;
+#endif
+  const auto backends = BackendManager::Get()->GetBackendNames();
+  options->Add<ChoiceOption>(SharedBackendParams::kBackendId, backends) =
+      backends.empty() ? "<none>" : backends[0];  // Default: first name.
+  options->Add<StringOption>(SharedBackendParams::kBackendOptionsId);
+  options->Add<IntOption>(SharedBackendParams::kNNCacheSizeId, 0, 999999999) =
+      2000000;  // Default cache size, in positions.
+}
+
+}  // namespace lczero
diff --git a/src/neural/shared_params.h b/src/neural/shared_params.h
new file mode 100644
index 0000000000..7a5a8c1b68
--- /dev/null
+++ b/src/neural/shared_params.h
@@ -0,0 +1,54 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "utils/optionsdict.h"
+#include "utils/optionsparser.h"
+
+namespace lczero {
+
+// Backend parameters that appear in UCI interface and are in use by most
+// backends.
+struct SharedBackendParams {
+  static const constexpr char* kEmbed = "<built in>";
+  static const constexpr char* kAutoDiscover = "<autodiscover>";
+
+  static const OptionId kPolicySoftmaxTemp;
+  static const OptionId kHistoryFill;
+  static const OptionId kWeightsId;
+  static const OptionId kBackendId;
+  static const OptionId kBackendOptionsId;
+  static const OptionId kNNCacheSizeId;
+
+  static void Populate(OptionsParser*);  // Registers the options above.
+
+ private:
+  SharedBackendParams() = delete;  // Static members only.
+};
+
+}  // namespace lczero
diff --git a/src/neural/tables/activation_function.h b/src/neural/tables/activation_function.h
new file mode 100644
index 0000000000..56a35f5608
--- /dev/null
+++ b/src/neural/tables/activation_function.h
@@ -0,0 +1,43 @@
+/*
+ This file is part of Leela Chess Zero.
+ Copyright (C) 2018-2022 The LCZero Authors
+
+ Leela Chess is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Leela Chess is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace lczero {
+// The following list matches the one in net.proto. Ideally this would be done
+// by including proto/net.pb.h, but this is incompatible with nvcc.
+enum ActivationFunction {
+  ACTIVATION_DEFAULT = 0,
+  ACTIVATION_MISH = 1,
+  ACTIVATION_RELU = 2,
+  ACTIVATION_NONE = 3,
+  ACTIVATION_TANH = 4,
+  ACTIVATION_SIGMOID = 5,
+  ACTIVATION_SELU = 6,
+  ACTIVATION_SWISH = 7,
+  ACTIVATION_RELU_2 = 8,
+  ACTIVATION_SOFTMAX = 9,
+};
+
+struct Activations {  // Three activation selections with their defaults.
+  ActivationFunction default_activation = ACTIVATION_RELU;
+  ActivationFunction smolgen_activation = ACTIVATION_SWISH;
+  ActivationFunction ffn_activation = ACTIVATION_RELU_2;
+};
+
+}  // namespace lczero
diff --git a/src/neural/shared/attention_policy_map.h b/src/neural/tables/attention_policy_map.h
similarity index 100%
rename from src/neural/shared/attention_policy_map.h
rename to src/neural/tables/attention_policy_map.h
diff --git a/src/neural/shared/policy_map.h b/src/neural/tables/policy_map.h
similarity index 100%
rename from src/neural/shared/policy_map.h
rename to src/neural/tables/policy_map.h
diff --git a/src/neural/wrapper.cc b/src/neural/wrapper.cc
new file mode 100644
index 0000000000..11d9dac78f
--- /dev/null
+++ b/src/neural/wrapper.cc
@@ -0,0 +1,198 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/wrapper.h"
+
+#include <algorithm>
+#include <numeric>
+
+#include "neural/encoder.h"
+#include "neural/shared_params.h"
+#include "utils/atomic_vector.h"
+#include "utils/fastmath.h"
+#include "utils/trace.h"
+
+namespace lczero {
+namespace {
+
+FillEmptyHistory EncodeHistoryFill(const std::string& history_fill) {
+  if (history_fill == "fen_only") return FillEmptyHistory::FEN_ONLY;
+  if (history_fill == "always") return FillEmptyHistory::ALWAYS;
+  assert(history_fill == "no");  // Any other value is a programming error.
+  return FillEmptyHistory::NO;  // Also the fallback when asserts are off.
+}
+
+class NetworkAsBackend : public Backend {  // Adapts a Network to Backend.
+ public:
+  NetworkAsBackend(std::unique_ptr<Network> network, const OptionsDict& options)
+      : network_(std::move(network)),
+        backend_opts_(
+            options.Get<std::string>(SharedBackendParams::kBackendOptionsId)),
+        weights_path_(
+            options.Get<std::string>(SharedBackendParams::kWeightsId)) {
+    UpdateConfiguration(options);  // Sets temperature and history fill.
+    const NetworkCapabilities& caps = network_->GetCapabilities();
+    attrs_.has_mlh = caps.has_mlh();
+    attrs_.has_wdl = caps.has_wdl();
+    attrs_.runs_on_cpu = network_->IsCpu();
+    attrs_.suggested_num_search_threads = network_->GetThreads();
+    attrs_.recommended_batch_size = network_->GetMiniBatchSize();
+    attrs_.maximum_batch_size = 1024;  // Also given to entries_ ctor.
+    input_format_ = caps.input_format;
+  }
+
+  BackendAttributes GetAttributes() const override { return attrs_; }
+  std::unique_ptr<BackendComputation> CreateComputation() override;
+  UpdateConfigurationResult UpdateConfiguration(
+      const OptionsDict& options) override {
+    Backend::UpdateConfiguration(options);
+    if (backend_opts_ !=
+        options.Get<std::string>(SharedBackendParams::kBackendOptionsId)) {
+      return NEED_RESTART;  // Backend options differ from construction.
+    }
+    if (weights_path_ !=
+        options.Get<std::string>(SharedBackendParams::kWeightsId)) {
+      return NEED_RESTART;  // Weights path differs from construction.
+    }
+    softmax_policy_temperature_ =  // Kept as 1/T, used as a multiplier.
+        1.0f / options.Get<float>(SharedBackendParams::kPolicySoftmaxTemp);
+    fill_empty_history_ = EncodeHistoryFill(
+        options.Get<std::string>(SharedBackendParams::kHistoryFill));
+    return UPDATE_OK;
+  }
+
+ private:
+  std::unique_ptr<Network> network_;
+  BackendAttributes attrs_;
+  pblczero::NetworkFormat::InputFormat input_format_;
+  float softmax_policy_temperature_;  // Reciprocal of the option value.
+  FillEmptyHistory fill_empty_history_;
+  const std::string backend_opts_;  // Snapshot taken at construction.
+  const std::string weights_path_;  // Snapshot taken at construction.
+
+  friend class NetworkAsBackendComputation;
+};
+
+class NetworkAsBackendComputation : public BackendComputation {
+ public:
+  NetworkAsBackendComputation(NetworkAsBackend* backend)
+      : backend_(backend),
+        computation_(backend_->network_->NewComputation()),
+        entries_(backend_->attrs_.maximum_batch_size) {}
+
+  size_t UsedBatchSize() const override { return entries_.size(); }
+
+  AddInputResult AddInput(const EvalPosition& pos,
+                          EvalResultPtr result) override {
+    int transform;  // Written by EncodePositionForNN() below.
+    const size_t idx = entries_.emplace_back(Entry{
+        .input = EncodePositionForNN(backend_->input_format_, pos.pos, 8,
+                                     backend_->fill_empty_history_, &transform),
+        .legal_moves = MoveList(pos.legal_moves.begin(), pos.legal_moves.end()),
+        .result = result,
+        .transform = 0});  // Placeholder; real value assigned below.
+    entries_[idx].transform = transform;
+    return ENQUEUED_FOR_EVAL;
+  }
+
+  void ComputeBlocking() override {
+    for (auto& entry : entries_) computation_->AddInput(std::move(entry.input));
+    computation_->ComputeBlocking();
+    LCTRACE_FUNCTION_SCOPE;
+    for (size_t i = 0; i < entries_.size(); ++i) {  // Write non-null outputs.
+      const EvalResultPtr& result = entries_[i].result;
+      if (result.q) *result.q = computation_->GetQVal(i);
+      if (result.d) *result.d = computation_->GetDVal(i);
+      if (result.m) *result.m = computation_->GetMVal(i);
+      if (!result.p.empty()) SoftmaxPolicy(result.p, computation_.get(), i);
+    }
+  }
+
+  void SoftmaxPolicy(std::span<float> dst,
+                     const NetworkComputation* computation, int idx) {
+    LCTRACE_FUNCTION_SCOPE;
+    const std::vector<Move>& moves = entries_[idx].legal_moves;
+    const int transform = entries_[idx].transform;
+    // Copy the values to the destination array and compute the maximum.
+    const float max_p = std::accumulate(
+        moves.begin(), moves.end(), -std::numeric_limits<float>::infinity(),
+        [&, counter = 0](float max_p, const Move& move) mutable {
+          return std::max(max_p, dst[counter++] = computation->GetPVal(
+                                     idx, MoveToNNIndex(move, transform)));
+        });
+    // Compute the softmax and compute the total.
+    const float temperature = backend_->softmax_policy_temperature_;
+    float total = std::accumulate(
+        dst.begin(), dst.end(), 0.0f, [&](float total, float& val) {
+          return total + (val = FastExp((val - max_p) * temperature));
+        });
+    const float scale = total > 0.0f ? 1.0f / total : 1.0f;
+    // Scale the values to sum to 1.0.
+    std::for_each(dst.begin(), dst.end(), [&](float& val) { val *= scale; });
+  }
+
+ private:
+  struct Entry {
+    InputPlanes input;
+    MoveList legal_moves;
+    EvalResultPtr result;
+    int transform;  // Passed to MoveToNNIndex() when reading the policy.
+  };
+
+  NetworkAsBackend* backend_;  // Not owned.
+  std::unique_ptr<NetworkComputation> computation_;
+  AtomicVector<Entry> entries_;
+};
+
+std::unique_ptr<BackendComputation> NetworkAsBackend::CreateComputation() {
+  return std::make_unique<NetworkAsBackendComputation>(this);
+}
+
+}  // namespace
+
+NetworkAsBackendFactory::NetworkAsBackendFactory(const std::string& name,
+                                                 FactoryFunc factory,  // Sink.
+                                                 int priority)
+    : name_(name), factory_(std::move(factory)), priority_(priority) {}
+
+std::unique_ptr<Backend> NetworkAsBackendFactory::Create(
+    const OptionsDict& options) {
+  const std::string backend_options =
+      options.Get<std::string>(SharedBackendParams::kBackendOptionsId);
+  OptionsDict network_options;  // Parsed form of the backend-opts string.
+  network_options.AddSubdictFromString(backend_options);
+
+  std::string net_path =
+      options.Get<std::string>(SharedBackendParams::kWeightsId);
+  std::optional<WeightsFile> weights = LoadWeights(net_path);
+  std::unique_ptr<Network> network =
+      factory_(std::move(weights), network_options);
+  network_options.CheckAllOptionsRead(name_);  // Likely flags unused keys.
+  return std::make_unique<NetworkAsBackend>(std::move(network), options);
+}
+
+}  // namespace lczero
diff --git a/src/neural/wrapper.h b/src/neural/wrapper.h
new file mode 100644
index 0000000000..c11c45bab3
--- /dev/null
+++ b/src/neural/wrapper.h
@@ -0,0 +1,56 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include "neural/network.h"
+#include "neural/register.h"
+
+namespace lczero {
+
+class NetworkAsBackendFactory : public BackendFactory {
+ public:
+  using FactoryFunc = std::function<std::unique_ptr<Network>(
+      const std::optional<WeightsFile>&, const OptionsDict&)>;
+
+  NetworkAsBackendFactory(const std::string& name, FactoryFunc factory,
+                          int priority = 0);  // Reported by GetPriority().
+
+  int GetPriority() const override { return priority_; }
+  std::string_view GetName() const override { return name_; }
+  std::unique_ptr<Backend> Create(const OptionsDict&) override;
+
+ private:
+  std::string name_;
+  FactoryFunc factory_;  // Builds the Network that Create() wraps.
+  int priority_;
+};
+
+}  // namespace lczero
diff --git a/src/neural/xla/hlo_builder.cc b/src/neural/xla/hlo_builder.cc
index 873d2d50a8..d018eb5deb 100644
--- a/src/neural/xla/hlo_builder.cc
+++ b/src/neural/xla/hlo_builder.cc
@@ -536,7 +536,7 @@ std::optional<HloComputation> HloBuilder::GetComputationId(
 HloComputation HloBuilder::AddComputation(std::string_view name,
                                           const HloBuilder& builder) {
   std::unordered_map<size_t, size_t> id_map;
-  if (computation_names_.count(std::string(name))) {
+  if (computation_names_.contains(std::string(name))) {
     throw Exception("Computation with name " + std::string(name) +
                     " already exists");
   }
diff --git a/src/neural/xla/hlo_builder.h b/src/neural/xla/hlo_builder.h
index 1211446765..652ccd6326 100644
--- a/src/neural/xla/hlo_builder.h
+++ b/src/neural/xla/hlo_builder.h
@@ -32,7 +32,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 #include "utils/logging.h"
 
 namespace lczero {
@@ -187,4 +187,4 @@ class HloContext {
   pblczero::XlaOpMetadata saved_metadata_;
 };
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/onnx2hlo.cc b/src/neural/xla/onnx2hlo.cc
index a33923de25..c6211ca2ba 100644
--- a/src/neural/xla/onnx2hlo.cc
+++ b/src/neural/xla/onnx2hlo.cc
@@ -32,8 +32,6 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#include "neural/onnx/onnx.pb.h"
-#include "neural/xla/hlo.pb.h"
 #include "neural/xla/hlo_builder.h"
 #include "neural/xla/print_hlo.h"
 #include "utils/bf16_utils.h"
@@ -273,7 +271,7 @@ pblczero::XlaLiteralProto ConstOpMax(const pblczero::XlaLiteralProto& lhs,
             typename std::remove_reference<decltype(lhs)>::type::value_type;
         std::transform(lhs.begin(), lhs.end(), rhs.begin(),
                        std::back_inserter(*dst),
-                       [](T a, T b) { return std::max(a, b); });
+                       [](const T &a, const T &b) { return std::max(a, b); });
       });
   return result;
 }
@@ -474,6 +472,8 @@ class Onnx2HloConverter {
     onnx_op_to_builder_["Gather"] = &Onnx2HloConverter::OpGather;
     onnx_op_to_builder_["GlobalAveragePool"] =
         &Onnx2HloConverter::OpGlobalAveragePool;
+    onnx_op_to_builder_["Greater"] = &Onnx2HloConverter::OpGreater;
+    onnx_op_to_builder_["Exp"] = &Onnx2HloConverter::OpExp;
     onnx_op_to_builder_["Expand"] = &Onnx2HloConverter::OpExpand;
     onnx_op_to_builder_["Identity"] = &Onnx2HloConverter::OpIdentity;
     onnx_op_to_builder_["LayerNormalization"] =
@@ -502,6 +502,7 @@ class Onnx2HloConverter {
     onnx_op_to_builder_["Tanh"] = &Onnx2HloConverter::OpTanh;
     onnx_op_to_builder_["Transpose"] = &Onnx2HloConverter::OpTranspose;
     onnx_op_to_builder_["Unsqueeze"] = &Onnx2HloConverter::OpUnsqueeze;
+    onnx_op_to_builder_["Where"] = &Onnx2HloConverter::OpWhere;
   }
 
   Onnx2HloResult Convert(const pblczero::ModelProto& onnx_model,
@@ -657,7 +658,7 @@ class Onnx2HloConverter {
   bool AllInputsConstant(const pblczero::NodeProto& node) {
     for (const auto& input : node.input()) {
       const std::string name(input);
-      if (initializers_.count(name)) continue;
+      if (initializers_.contains(name)) continue;
       if (auto iter = onnx_name_to_hlo_flow_.find(name);
           iter != onnx_name_to_hlo_flow_.end() &&
           iter->second->opcode() == "constant") {
@@ -1492,6 +1493,31 @@ class Onnx2HloConverter {
     return {builder_.Multiply(flow, input)};
   }
 
+  // ONNX "Exp": emits an HLO Exponential over the node's first input.
+  std::vector<HloFlow> OpExp(const pblczero::NodeProto& node) {
+    CheckKnownAttributes(node, 1, {});
+    return {builder_.Exponential(GetInput(node, 0))};
+  }
+
+  std::vector<HloFlow> OpGreater(const pblczero::NodeProto& node) {
+    CheckKnownAttributes(node, 2, {});
+    auto* left = GetInput(node, 0);
+    auto* right = GetInput(node, 1);
+    std::tie(left, right) = EqualizeShape(left, right);
+    return {builder_.Compare(left, right, "GT")};  // ONNX "Greater": a > b.
+  }
+
+  std::vector<HloFlow> OpWhere(const pblczero::NodeProto& node) {
+    CheckKnownAttributes(node, 3, {});
+    auto* cond = GetInput(node, 0);      // Selector passed to Select().
+    auto* if_true = GetInput(node, 1);   // Select()'s on-true operand.
+    auto* if_false = GetInput(node, 2);  // Select()'s on-false operand.
+    std::tie(if_true, if_false) = EqualizeShape(if_true, if_false);
+    std::tie(cond, if_true) = EqualizeShape(cond, if_true);
+    std::tie(cond, if_false) = EqualizeShape(cond, if_false);
+    return {builder_.Select(cond, if_true, if_false)};
+  }
+
   /////////////////////////////////////////////////////////////////////////////
   // Helper computations
   /////////////////////////////////////////////////////////////////////////////
@@ -1718,4 +1744,4 @@ std::unique_ptr<XlaTensor> OnnxTensorToXlaTensor(
       onnx_tensor.raw_data());
 }
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/onnx2hlo.h b/src/neural/xla/onnx2hlo.h
index fe2f1876d4..ddc7bd16df 100644
--- a/src/neural/xla/onnx2hlo.h
+++ b/src/neural/xla/onnx2hlo.h
@@ -27,12 +27,13 @@
 
 #pragma once
 
+#include <optional>
 #include <string_view>
 #include <vector>
 
-#include "neural/onnx/onnx.pb.h"
-#include "neural/xla/hlo.pb.h"
-#include "neural/xla/xla_runner.h"
+#include "neural/xla/xla_tensor.h"
+#include "proto/hlo.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
diff --git a/src/neural/xla/print_hlo.h b/src/neural/xla/print_hlo.h
index e906bbe346..c7db16cbfa 100644
--- a/src/neural/xla/print_hlo.h
+++ b/src/neural/xla/print_hlo.h
@@ -29,7 +29,7 @@
 
 #include <iostream>
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 
 namespace lczero {
 
@@ -43,4 +43,4 @@ struct PrettyPrintHloOptions {
 void PrettyPrintHlo(const pblczero::HloModuleProto& module,
                     PrettyPrintHloOptions options, std::ostream& stream);
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/xla_tensor.h b/src/neural/xla/xla_tensor.h
index b49766b9cf..43f9899522 100644
--- a/src/neural/xla/xla_tensor.h
+++ b/src/neural/xla/xla_tensor.h
@@ -33,7 +33,7 @@
 #include <string>
 #include <vector>
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 #include "utils/exception.h"
 
 namespace lczero {
@@ -136,4 +136,4 @@ class XlaMutableTensor : public XlaTensor {
   std::unique_ptr<char[]> data_;
 };
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/python/weights.h b/src/python/weights.h
index 18288c5f69..53ad0968e4 100644
--- a/src/python/weights.h
+++ b/src/python/weights.h
@@ -235,7 +235,8 @@ class Backend {
 class GameState {
  public:
   GameState(const std::optional<std::string> startpos,
-            const std::vector<std::string>& moves) {
+            const std::vector<std::string>& moves,
+            const bool is_c960): is_c960_(is_c960) {
     ChessBoard starting_board;
     int no_capture_ply;
     int full_moves;
@@ -246,12 +247,16 @@ class GameState {
                    full_moves * 2 - (starting_board.flipped() ? 1 : 2));
 
     for (const auto& m : moves) {
-      Move move(m, history_.IsBlackToMove());
-      move = history_.Last().GetBoard().GetModernMove(move);
+      auto board = history_.Last().GetBoard();
+      Move move = board.ParseMove(m);
       history_.Append(move);
     }
   }
 
+  GameState(const std::optional<std::string> startpos,
+            const std::vector<std::string>& moves)
+      : GameState(startpos, moves, false) {}  // is_c960 defaults to false.
+
   std::unique_ptr<Input> as_input(const Backend& backend) const {
     int tmp;
     return std::make_unique<Input>(
@@ -265,8 +270,8 @@ class GameState {
     bool is_black = history_.IsBlackToMove();
     std::vector<std::string> result;
     for (auto m : ms) {
-      if (is_black) m.Mirror();
-      result.push_back(m.as_string());
+      if (is_black) m.Flip();
+      result.push_back(m.ToString(is_c960_));
     }
     return result;
   }
@@ -275,20 +280,20 @@ class GameState {
     auto ms = history_.Last().GetBoard().GenerateLegalMoves();
     std::vector<int> result;
     for (auto m : ms) {
-      result.push_back(m.as_nn_index(/* transform= */ 0));
+      result.push_back(MoveToNNIndex(m, /* transform= */ 0));
     }
     return result;
   }
 
   std::string as_string() const {
-    bool is_black = history_.IsBlackToMove();
-    return (is_black ? history_.Last().GetThemBoard()
-                     : history_.Last().GetBoard())
-        .DebugString();
+    auto board = history_.Last().GetBoard();  // Local copy for Mirror().
+    if (history_.IsBlackToMove()) board.Mirror();
+    return board.DebugString();
   }
 
  private:
   PositionHistory history_;
+  bool is_c960_;
 };
 
 }  // namespace python
diff --git a/src/rescorer/rescoreloop.cc b/src/rescorer/rescoreloop.cc
deleted file mode 100644
index 2da34c3c22..0000000000
--- a/src/rescorer/rescoreloop.cc
+++ /dev/null
@@ -1,1359 +0,0 @@
-/*
-  This file is part of Leela Chess Zero.
-  Copyright (C) 2018-2024 The LCZero Authors
-
-  Leela Chess is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Leela Chess is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
-
-  Additional permission under GNU GPL version 3 section 7
-
-  If you modify this Program, or any covered work, by linking or
-  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
-  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
-  modified version of those libraries), containing parts covered by the
-  terms of the respective license agreement, the licensors of this
-  Program grant you additional permission to convey the resulting work.
-*/
-
-#include "rescorer/rescoreloop.h"
-
-#include <optional>
-#include <sstream>
-
-#include "gtb-probe.h"
-#include "neural/decoder.h"
-#include "syzygy/syzygy.h"
-#include "trainingdata/reader.h"
-#include "utils/filesystem.h"
-#include "utils/optionsparser.h"
-
-namespace lczero {
-
-namespace {
-const OptionId kSyzygyTablebaseId{"syzygy-paths", "",
-                                  "List of Syzygy tablebase directories"};
-const OptionId kGaviotaTablebaseId{"gaviotatb-paths", "",
-                                   "List of Gaviota tablebase directories"};
-const OptionId kInputDirId{
-    "input", "", "Directory with gzipped files in need of rescoring."};
-const OptionId kPolicySubsDirId{"policy-substitutions", "",
-                                "Directory with gzipped files are to use to "
-                                "replace policy for some of the data."};
-const OptionId kOutputDirId{"output", "", "Directory to write rescored files."};
-const OptionId kThreadsId{"threads", "",
-                          "Number of concurrent threads to rescore with.", 't'};
-const OptionId kTempId{"temperature", "",
-                       "Additional temperature to apply to policy target."};
-const OptionId kDistributionOffsetId{
-    "dist_offset", "",
-    "Additional offset to apply to policy target before temperature."};
-const OptionId kMinDTZBoostId{
-    "dtz_policy_boost", "",
-    "Additional offset to apply to policy target before temperature for moves "
-    "that are best dtz option."};
-const OptionId kNewInputFormatId{
-    "new-input-format", "",
-    "Input format to convert training data to during rescoring."};
-const OptionId kDeblunder{
-    "deblunder", "",
-    "If true, whether to use move Q information to infer a different Z value "
-    "if the the selected move appears to be a blunder."};
-const OptionId kDeblunderQBlunderThreshold{
-    "deblunder-q-blunder-threshold", "",
-    "The amount Q of played move needs to be worse than best move in order to "
-    "assume the played move is a blunder."};
-const OptionId kDeblunderQBlunderWidth{
-    "deblunder-q-blunder-width", "",
-    "Width of the transition between accepted temp moves and blunders."};
-const OptionId kNnuePlainFileId{"nnue-plain-file", "",
-                                "Append SF plain format training data to this "
-                                "file. Will be generated if not there."};
-const OptionId kNnueBestScoreId{"nnue-best-score", "",
-                                "For the SF training data use the score of the "
-                                "best move instead of the played one."};
-const OptionId kNnueBestMoveId{
-    "nnue-best-move", "",
-    "For the SF training data record the best move instead of the played one. "
-    "If set to true the generated files do not compress well."};
-const OptionId kDeleteFilesId{"delete-files", "",
-                              "Delete the input files after processing."};
-
-class PolicySubNode {
- public:
-  PolicySubNode() {
-    for (int i = 0; i < 1858; i++) children[i] = nullptr;
-  }
-  bool active = false;
-  float policy[1858];
-  PolicySubNode* children[1858];
-};
-
-std::atomic<int> games(0);
-std::atomic<int> positions(0);
-std::atomic<int> rescored(0);
-std::atomic<int> delta(0);
-std::atomic<int> rescored2(0);
-std::atomic<int> rescored3(0);
-std::atomic<int> blunders(0);
-std::atomic<int> orig_counts[3];
-std::atomic<int> fixed_counts[3];
-std::atomic<int> policy_bump(0);
-std::atomic<int> policy_nobump_total_hist[11];
-std::atomic<int> policy_bump_total_hist[11];
-std::atomic<int> policy_dtm_bump(0);
-std::atomic<int> gaviota_dtm_rescores(0);
-std::map<uint64_t, PolicySubNode> policy_subs;
-bool gaviotaEnabled = false;
-bool deblunderEnabled = false;
-float deblunderQBlunderThreshold = 2.0f;
-float deblunderQBlunderWidth = 0.0f;
-
-void DataAssert(bool check_result) {
-  if (!check_result) throw Exception("Range Violation");
-}
-
-void Validate(const std::vector<V6TrainingData>& fileContents) {
-  if (fileContents.empty()) throw Exception("Empty File");
-
-  for (size_t i = 0; i < fileContents.size(); i++) {
-    auto& data = fileContents[i];
-    DataAssert(
-        data.input_format ==
-            pblczero::NetworkFormat::INPUT_CLASSICAL_112_PLANE ||
-        data.input_format ==
-            pblczero::NetworkFormat::INPUT_112_WITH_CASTLING_PLANE ||
-        data.input_format ==
-            pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION ||
-        data.input_format == pblczero::NetworkFormat::
-                                 INPUT_112_WITH_CANONICALIZATION_HECTOPLIES ||
-        data.input_format ==
-            pblczero::NetworkFormat::
-                INPUT_112_WITH_CANONICALIZATION_HECTOPLIES_ARMAGEDDON ||
-        data.input_format ==
-            pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION_V2 ||
-        data.input_format == pblczero::NetworkFormat::
-                                 INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON);
-    DataAssert(data.best_d >= 0.0f && data.best_d <= 1.0f);
-    DataAssert(data.root_d >= 0.0f && data.root_d <= 1.0f);
-    DataAssert(data.best_q >= -1.0f && data.best_q <= 1.0f);
-    DataAssert(data.root_q >= -1.0f && data.root_q <= 1.0f);
-    DataAssert(data.root_m >= 0.0f);
-    DataAssert(data.best_m >= 0.0f);
-    DataAssert(data.plies_left >= 0.0f);
-    switch (data.input_format) {
-      case pblczero::NetworkFormat::INPUT_CLASSICAL_112_PLANE:
-        DataAssert(data.castling_them_oo <= 1);
-        DataAssert(data.castling_them_ooo <= 1);
-        DataAssert(data.castling_us_oo <= 1);
-        DataAssert(data.castling_us_ooo <= 1);
-        break;
-      default:
-        // Verifiy at most one bit set.
-        DataAssert((data.castling_them_oo & (data.castling_them_oo - 1)) == 0);
-        DataAssert((data.castling_them_ooo & (data.castling_them_ooo - 1)) ==
-                   0);
-        DataAssert((data.castling_us_oo & (data.castling_us_oo - 1)) == 0);
-        DataAssert((data.castling_us_ooo & (data.castling_us_ooo - 1)) == 0);
-    }
-    if (IsCanonicalFormat(static_cast<pblczero::NetworkFormat::InputFormat>(
-            data.input_format))) {
-      // At most one en-passant bit.
-      DataAssert((data.side_to_move_or_enpassant &
-                  (data.side_to_move_or_enpassant - 1)) == 0);
-    } else {
-      DataAssert(data.side_to_move_or_enpassant <= 1);
-    }
-    DataAssert(data.result_q >= -1 && data.result_q <= 1);
-    DataAssert(data.result_d >= 0 && data.result_q <= 1);
-    DataAssert(data.rule50_count <= 100);
-    float sum = 0.0f;
-    for (size_t j = 0; j < sizeof(data.probabilities) / sizeof(float); j++) {
-      float prob = data.probabilities[j];
-      DataAssert((prob >= 0.0f && prob <= 1.0f) || prob == -1.0f ||
-                 std::isnan(prob));
-      if (prob >= 0.0f) {
-        sum += prob;
-      }
-      // Only check best_idx/played_idx for real v6 data.
-      if (data.visits > 0) {
-        // Best_idx and played_idx must be marked legal in probabilities.
-        if (j == data.best_idx || j == data.played_idx) {
-          DataAssert(prob >= 0.0f);
-        }
-      }
-    }
-    if (sum < 0.99f || sum > 1.01f) {
-      throw Exception("Probability sum error is huge!");
-    }
-    DataAssert(data.best_idx <= 1858);
-    DataAssert(data.played_idx <= 1858);
-    DataAssert(data.played_q >= -1.0f && data.played_q <= 1.0f);
-    DataAssert(data.played_d >= 0.0f && data.played_d <= 1.0f);
-    DataAssert(data.played_m >= 0.0f);
-    DataAssert(std::isnan(data.orig_q) ||
-               (data.orig_q >= -1.0f && data.orig_q <= 1.0f));
-    DataAssert(std::isnan(data.orig_d) ||
-               (data.orig_d >= 0.0f && data.orig_d <= 1.0f));
-    DataAssert(std::isnan(data.orig_m) || data.orig_m >= 0.0f);
-    // TODO: if visits > 0 - assert best_idx/played_idx are valid in
-    // probabilities.
-  }
-}
-
-void Validate(const std::vector<V6TrainingData>& fileContents,
-              const MoveList& moves) {
-  PositionHistory history;
-  int rule50ply;
-  int gameply;
-  ChessBoard board;
-  auto input_format = static_cast<pblczero::NetworkFormat::InputFormat>(
-      fileContents[0].input_format);
-  PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]), &board,
-                &rule50ply, &gameply);
-  history.Reset(board, rule50ply, gameply);
-  for (size_t i = 0; i < moves.size(); i++) {
-    int transform = TransformForPosition(input_format, history);
-    // If real v6 data, can confirm that played_idx matches the inferred move.
-    if (fileContents[i].visits > 0) {
-      if (fileContents[i].played_idx != moves[i].as_nn_index(transform)) {
-        throw Exception("Move performed is not listed as played.");
-      }
-    }
-    // Move shouldn't be marked illegal unless there is 0 visits, which should
-    // only happen if invariance_info is marked with the placeholder bit.
-    if (!(fileContents[i].probabilities[moves[i].as_nn_index(transform)] >=
-          0.0f) &&
-        (fileContents[i].invariance_info & 64) == 0) {
-      std::cerr << "Illegal move: " << moves[i].as_string() << std::endl;
-      throw Exception("Move performed is marked illegal in probabilities.");
-    }
-    auto legal = history.Last().GetBoard().GenerateLegalMoves();
-    if (std::find(legal.begin(), legal.end(), moves[i]) == legal.end()) {
-      std::cerr << "Illegal move: " << moves[i].as_string() << std::endl;
-      throw Exception("Move performed is an illegal move.");
-    }
-    history.Append(moves[i]);
-  }
-}
-
-void gaviota_tb_probe_hard(const Position& pos, unsigned int& info,
-                           unsigned int& dtm) {
-  unsigned int wsq[17];
-  unsigned int bsq[17];
-  unsigned char wpc[17];
-  unsigned char bpc[17];
-
-  auto stm = pos.IsBlackToMove() ? tb_BLACK_TO_MOVE : tb_WHITE_TO_MOVE;
-  auto& board = pos.IsBlackToMove() ? pos.GetThemBoard() : pos.GetBoard();
-  auto epsq = tb_NOSQUARE;
-  for (auto sq : board.en_passant()) {
-    // Our internal representation stores en_passant 2 rows away
-    // from the actual sq.
-    if (sq.row() == 0) {
-      epsq = (TB_squares)(sq.as_int() + 16);
-    } else {
-      epsq = (TB_squares)(sq.as_int() - 16);
-    }
-  }
-  int idx = 0;
-  for (auto sq : (board.ours() & board.kings())) {
-    wsq[idx] = (TB_squares)sq.as_int();
-    wpc[idx] = tb_KING;
-    idx++;
-  }
-  for (auto sq : (board.ours() & board.knights())) {
-    wsq[idx] = (TB_squares)sq.as_int();
-    wpc[idx] = tb_KNIGHT;
-    idx++;
-  }
-  for (auto sq : (board.ours() & board.queens())) {
-    wsq[idx] = (TB_squares)sq.as_int();
-    wpc[idx] = tb_QUEEN;
-    idx++;
-  }
-  for (auto sq : (board.ours() & board.rooks())) {
-    wsq[idx] = (TB_squares)sq.as_int();
-    wpc[idx] = tb_ROOK;
-    idx++;
-  }
-  for (auto sq : (board.ours() & board.bishops())) {
-    wsq[idx] = (TB_squares)sq.as_int();
-    wpc[idx] = tb_BISHOP;
-    idx++;
-  }
-  for (auto sq : (board.ours() & board.pawns())) {
-    wsq[idx] = (TB_squares)sq.as_int();
-    wpc[idx] = tb_PAWN;
-    idx++;
-  }
-  wsq[idx] = tb_NOSQUARE;
-  wpc[idx] = tb_NOPIECE;
-
-  idx = 0;
-  for (auto sq : (board.theirs() & board.kings())) {
-    bsq[idx] = (TB_squares)sq.as_int();
-    bpc[idx] = tb_KING;
-    idx++;
-  }
-  for (auto sq : (board.theirs() & board.knights())) {
-    bsq[idx] = (TB_squares)sq.as_int();
-    bpc[idx] = tb_KNIGHT;
-    idx++;
-  }
-  for (auto sq : (board.theirs() & board.queens())) {
-    bsq[idx] = (TB_squares)sq.as_int();
-    bpc[idx] = tb_QUEEN;
-    idx++;
-  }
-  for (auto sq : (board.theirs() & board.rooks())) {
-    bsq[idx] = (TB_squares)sq.as_int();
-    bpc[idx] = tb_ROOK;
-    idx++;
-  }
-  for (auto sq : (board.theirs() & board.bishops())) {
-    bsq[idx] = (TB_squares)sq.as_int();
-    bpc[idx] = tb_BISHOP;
-    idx++;
-  }
-  for (auto sq : (board.theirs() & board.pawns())) {
-    bsq[idx] = (TB_squares)sq.as_int();
-    bpc[idx] = tb_PAWN;
-    idx++;
-  }
-  bsq[idx] = tb_NOSQUARE;
-  bpc[idx] = tb_NOPIECE;
-
-  tb_probe_hard(stm, epsq, tb_NOCASTLE, wsq, bsq, wpc, bpc, &info, &dtm);
-}
-
-void ChangeInputFormat(int newInputFormat, V6TrainingData* data,
-                       const PositionHistory& history) {
-  data->input_format = newInputFormat;
-  auto input_format =
-      static_cast<pblczero::NetworkFormat::InputFormat>(newInputFormat);
-
-  // Populate planes.
-  int transform;
-  InputPlanes planes = EncodePositionForNN(input_format, history, 8,
-                                           FillEmptyHistory::NO, &transform);
-  int plane_idx = 0;
-  for (auto& plane : data->planes) {
-    plane = ReverseBitsInBytes(planes[plane_idx++].mask);
-  }
-
-  if ((data->invariance_info & 7) != transform) {
-    // Probabilities need reshuffling.
-    float newProbs[1858];
-    std::fill(std::begin(newProbs), std::end(newProbs), -1);
-    bool played_fixed = false;
-    bool best_fixed = false;
-    for (auto move : history.Last().GetBoard().GenerateLegalMoves()) {
-      int i = move.as_nn_index(transform);
-      int j = move.as_nn_index(data->invariance_info & 7);
-      newProbs[i] = data->probabilities[j];
-      // For V6 data only, the played/best idx need updating.
-      if (data->visits > 0) {
-        if (data->played_idx == j && !played_fixed) {
-          data->played_idx = i;
-          played_fixed = true;
-        }
-        if (data->best_idx == j && !best_fixed) {
-          data->best_idx = i;
-          best_fixed = true;
-        }
-      }
-    }
-    for (int i = 0; i < 1858; i++) {
-      data->probabilities[i] = newProbs[i];
-    }
-  }
-
-  const auto& position = history.Last();
-  const auto& castlings = position.GetBoard().castlings();
-  // Populate castlings.
-  // For non-frc trained nets, just send 1 like we used to.
-  uint8_t our_queen_side = 1;
-  uint8_t our_king_side = 1;
-  uint8_t their_queen_side = 1;
-  uint8_t their_king_side = 1;
-  // If frc trained, send the bit mask representing rook position.
-  if (Is960CastlingFormat(input_format)) {
-    our_queen_side <<= castlings.our_queenside_rook();
-    our_king_side <<= castlings.our_kingside_rook();
-    their_queen_side <<= castlings.their_queenside_rook();
-    their_king_side <<= castlings.their_kingside_rook();
-  }
-
-  data->castling_us_ooo = castlings.we_can_000() ? our_queen_side : 0;
-  data->castling_us_oo = castlings.we_can_00() ? our_king_side : 0;
-  data->castling_them_ooo = castlings.they_can_000() ? their_queen_side : 0;
-  data->castling_them_oo = castlings.they_can_00() ? their_king_side : 0;
-
-  // Save the bits that aren't connected to the input_format.
-  uint8_t invariance_mask = data->invariance_info & 0x78;
-  // Other params.
-  if (IsCanonicalFormat(input_format)) {
-    data->side_to_move_or_enpassant =
-        position.GetBoard().en_passant().as_int() >> 56;
-    if ((transform & FlipTransform) != 0) {
-      data->side_to_move_or_enpassant =
-          ReverseBitsInBytes(data->side_to_move_or_enpassant);
-    }
-    // Send transform in deprecated move count so rescorer can reverse it to
-    // calculate the actual move list from the input data.
-    data->invariance_info =
-        transform | (position.IsBlackToMove() ? (1u << 7) : 0u);
-  } else {
-    data->side_to_move_or_enpassant = position.IsBlackToMove() ? 1 : 0;
-    data->invariance_info = 0;
-  }
-  // Put the mask back.
-  data->invariance_info |= invariance_mask;
-}
-
-int ResultForData(const V6TrainingData& data) {
-  // Ensure we aren't reprocessing some data that has had custom adjustments to
-  // result training target applied.
-  DataAssert(data.result_q == -1.0f || data.result_q == 1.0f ||
-             data.result_q == 0.0f);
-  // Paranoia - ensure int cast never breaks the value.
-  DataAssert(data.result_q ==
-             static_cast<float>(static_cast<int>(data.result_q)));
-  return static_cast<int>(data.result_q);
-}
-
-std::string AsNnueString(const Position& p, Move m, float q, int result) {
-  std::ostringstream out;
-  out << "fen " << GetFen(p) << std::endl;
-  m = p.GetBoard().GetLegacyMove(m);
-  if (m.from().row() == ChessBoard::Rank::RANK_7 &&
-      p.GetBoard().pawns().get(m.from()) &&
-      m.promotion() == Move::Promotion::None) {
-    m.SetPromotion(Move::Promotion::Knight);
-  }
-  if (p.IsBlackToMove()) m.Mirror();
-  out << "move " << m.as_string() << std::endl;
-  // Formula from PR1477 adjuster for SF PawnValueEg.
-  out << "score " << round(660.6 * q / (1 - 0.9751875 * std::pow(q, 10)))
-      << std::endl;
-  out << "ply " << p.GetGamePly() << std::endl;
-  out << "result " << result << std::endl;
-  out << "e" << std::endl;
-  return out.str();
-}
-
-struct ProcessFileFlags {
-  bool delete_files : 1;
-  bool nnue_best_score : 1;
-  bool nnue_best_move : 1;
-};
-
-void ProcessFile(const std::string& file, SyzygyTablebase* tablebase,
-                 std::string outputDir, float distTemp, float distOffset,
-                 float dtzBoost, int newInputFormat,
-                 std::string nnue_plain_file, ProcessFileFlags flags) {
-  // Scope to ensure reader and writer are closed before deleting source file.
-  {
-    try {
-      TrainingDataReader reader(file);
-      std::vector<V6TrainingData> fileContents;
-      V6TrainingData data;
-      while (reader.ReadChunk(&data)) {
-        fileContents.push_back(data);
-      }
-      Validate(fileContents);
-      MoveList moves;
-      for (size_t i = 1; i < fileContents.size(); i++) {
-        moves.push_back(
-            DecodeMoveFromInput(PlanesFromTrainingData(fileContents[i]),
-                                PlanesFromTrainingData(fileContents[i - 1])));
-        // All moves decoded are from the point of view of the side after the
-        // move so need to mirror them all to be applicable to apply to the
-        // position before.
-        moves.back().Mirror();
-      }
-      Validate(fileContents, moves);
-      games += 1;
-      positions += fileContents.size();
-      PositionHistory history;
-      int rule50ply;
-      int gameply;
-      ChessBoard board;
-      auto input_format = static_cast<pblczero::NetworkFormat::InputFormat>(
-          fileContents[0].input_format);
-      PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                    &board, &rule50ply, &gameply);
-      history.Reset(board, rule50ply, gameply);
-      uint64_t rootHash = HashCat(board.Hash(), rule50ply);
-      if (policy_subs.find(rootHash) != policy_subs.end()) {
-        PolicySubNode* rootNode = &policy_subs[rootHash];
-        for (size_t i = 0; i < fileContents.size(); i++) {
-          if (rootNode->active) {
-            /* Some logic for choosing a softmax to apply to better align the
-            new policy with the old policy...
-            double bestkld =
-              std::numeric_limits<double>::max(); float besttemp = 1.0f;
-            // Minima is usually in this range for 'better' data.
-            for (float temp = 1.0f; temp < 3.0f; temp += 0.1f) {
-              float soft[1858];
-              float sum = 0.0f;
-              for (int j = 0; j < 1858; j++) {
-                if (rootNode->policy[j] >= 0.0) {
-                  soft[j] = std::pow(rootNode->policy[j], 1.0f / temp);
-                  sum += soft[j];
-                } else {
-                  soft[j] = -1.0f;
-                }
-              }
-              double kld = 0.0;
-              for (int j = 0; j < 1858; j++) {
-                if (soft[j] >= 0.0) soft[j] /= sum;
-                if (rootNode->policy[j] > 0.0 &&
-                    fileContents[i].probabilities[j] > 0) {
-                  kld += -1.0f * soft[j] *
-                    std::log(fileContents[i].probabilities[j] / soft[j]);
-                }
-              }
-              if (kld < bestkld) {
-                bestkld = kld;
-                besttemp = temp;
-              }
-            }
-            std::cerr << i << " " << besttemp << " " << bestkld << std::endl;
-            */
-            for (int j = 0; j < 1858; j++) {
-              /*
-              if (rootNode->policy[j] >= 0.0) {
-                std::cerr << i << " " << j << " " << rootNode->policy[j] << " "
-                          << fileContents[i].probabilities[j] << std::endl;
-              }
-              */
-              fileContents[i].probabilities[j] = rootNode->policy[j];
-            }
-          }
-          if (i + 1 < fileContents.size()) {
-            int transform = TransformForPosition(input_format, history);
-            int idx = moves[i].as_nn_index(transform);
-            if (rootNode->children[idx] == nullptr) {
-              break;
-            }
-            rootNode = rootNode->children[idx];
-            history.Append(moves[i]);
-          }
-        }
-      }
-
-      PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                    &board, &rule50ply, &gameply);
-      history.Reset(board, rule50ply, gameply);
-      int last_rescore = -1;
-      orig_counts[ResultForData(fileContents[0]) + 1]++;
-      fixed_counts[ResultForData(fileContents[0]) + 1]++;
-      for (int i = 0; i < static_cast<int>(moves.size()); i++) {
-        history.Append(moves[i]);
-        const auto& board = history.Last().GetBoard();
-        if (board.castlings().no_legal_castle() &&
-            history.Last().GetRule50Ply() == 0 &&
-            (board.ours() | board.theirs()).count() <=
-                tablebase->max_cardinality()) {
-          ProbeState state;
-          WDLScore wdl = tablebase->probe_wdl(history.Last(), &state);
-          // Only fail state means the WDL is wrong, probe_wdl may produce
-          // correct result with a stat other than OK.
-          if (state != FAIL) {
-            int8_t score_to_apply = 0;
-            if (wdl == WDL_WIN) {
-              score_to_apply = 1;
-            } else if (wdl == WDL_LOSS) {
-              score_to_apply = -1;
-            }
-            for (int j = i + 1; j > last_rescore; j--) {
-              if (ResultForData(fileContents[j]) != score_to_apply) {
-                if (j == i + 1 && last_rescore == -1) {
-                  fixed_counts[ResultForData(fileContents[0]) + 1]--;
-                  bool flip = (i % 2) == 0;
-                  fixed_counts[(flip ? -score_to_apply : score_to_apply) + 1]++;
-                  /*
-                  std::cerr << "Rescoring: " << file << " "  <<
-                  (int)fileContents[j].result << " -> "
-                            << (int)score_to_apply
-                            << std::endl;
-                            */
-                }
-                rescored += 1;
-                delta += abs(ResultForData(fileContents[j]) - score_to_apply);
-                /*
-              std::cerr << "Rescoring: " << (int)fileContents[j].result << " ->
-              "
-                        << (int)score_to_apply
-                        << std::endl;
-                        */
-              }
-
-              if (score_to_apply == 0) {
-                fileContents[j].result_d = 1.0f;
-              } else {
-                fileContents[j].result_d = 0.0f;
-              }
-              fileContents[j].result_q = static_cast<float>(score_to_apply);
-              score_to_apply = -score_to_apply;
-            }
-            last_rescore = i + 1;
-          }
-        }
-      }
-      PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                    &board, &rule50ply, &gameply);
-      history.Reset(board, rule50ply, gameply);
-      for (size_t i = 0; i < moves.size(); i++) {
-        history.Append(moves[i]);
-        const auto& board = history.Last().GetBoard();
-        if (board.castlings().no_legal_castle() &&
-            history.Last().GetRule50Ply() != 0 &&
-            (board.ours() | board.theirs()).count() <=
-                tablebase->max_cardinality()) {
-          ProbeState state;
-          WDLScore wdl = tablebase->probe_wdl(history.Last(), &state);
-          // Only fail state means the WDL is wrong, probe_wdl may produce
-          // correct result with a stat other than OK.
-          if (state != FAIL) {
-            int8_t score_to_apply = 0;
-            if (wdl == WDL_WIN) {
-              score_to_apply = 1;
-            } else if (wdl == WDL_LOSS) {
-              score_to_apply = -1;
-            }
-            // If the WDL result disagrees with the game outcome, make it a
-            // draw. WDL draw is always draw regardless of prior moves since
-            // zero, so that clearly works. Otherwise, the WDL result could be
-            // correct or draw, so best we can do is change scores that don't
-            // agree, to be a draw. If score was a draw this is a no-op, if it
-            // was opposite it becomes a draw.
-            int8_t new_score =
-                ResultForData(fileContents[i + 1]) != score_to_apply
-                    ? 0
-                    : ResultForData(fileContents[i + 1]);
-            bool dtz_rescored = false;
-            // if score is not already right, and the score to apply isn't 0,
-            // dtz can let us know its definitely correct.
-            if (ResultForData(fileContents[i + 1]) != score_to_apply &&
-                score_to_apply != 0) {
-              // Any repetitions in the history since last 50 ply makes it risky
-              // to assume dtz is still correct.
-              int steps = history.Last().GetRule50Ply();
-              bool no_reps = true;
-              for (int i = 0; i < steps; i++) {
-                // If game started from non-zero 50 move rule, this could
-                // underflow. Only safe option is to assume there were
-                // repetitions before this point.
-                if (history.GetLength() - i - 1 < 0) {
-                  no_reps = false;
-                  break;
-                }
-                if (history.GetPositionAt(history.GetLength() - i - 1)
-                        .GetRepetitions() != 0) {
-                  no_reps = false;
-                  break;
-                }
-              }
-              if (no_reps) {
-                int depth = tablebase->probe_dtz(history.Last(), &state);
-                if (state != FAIL) {
-                  // This should be able to be <= 99 safely, but I've not
-                  // convinced myself thats true.
-                  if (steps + std::abs(depth) < 99) {
-                    rescored3++;
-                    new_score = score_to_apply;
-                    dtz_rescored = true;
-                  }
-                }
-              }
-            }
-
-            // If score is not already a draw, and its not obviously a draw,
-            // check if 50 move rule has advanced so far its obviously a draw.
-            // Obviously not needed if we've already proven with dtz that its a
-            // win/loss.
-            if (ResultForData(fileContents[i + 1]) != 0 &&
-                score_to_apply != 0 && !dtz_rescored) {
-              int depth = tablebase->probe_dtz(history.Last(), &state);
-              if (state != FAIL) {
-                int steps = history.Last().GetRule50Ply();
-                // This should be able to be >= 101 safely, but I've not
-                // convinced myself thats true.
-                if (steps + std::abs(depth) > 101) {
-                  rescored3++;
-                  new_score = 0;
-                  dtz_rescored = true;
-                }
-              }
-            }
-            if (new_score != ResultForData(fileContents[i + 1])) {
-              rescored2 += 1;
-              /*
-            std::cerr << "Rescoring: " << (int)fileContents[j].result << " -> "
-                      << (int)score_to_apply
-                      << std::endl;
-                      */
-            }
-
-            if (new_score == 0) {
-              fileContents[i + 1].result_d = 1.0f;
-            } else {
-              fileContents[i + 1].result_d = 0.0f;
-            }
-            fileContents[i + 1].result_q = static_cast<float>(new_score);
-          }
-        }
-      }
-
-      if (distTemp != 1.0f || distOffset != 0.0f || dtzBoost != 0.0f) {
-        PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                      &board, &rule50ply, &gameply);
-        history.Reset(board, rule50ply, gameply);
-        int move_index = 0;
-        for (auto& chunk : fileContents) {
-          const auto& board = history.Last().GetBoard();
-          std::vector<bool> boost_probs(1858, false);
-          int boost_count = 0;
-
-          if (dtzBoost != 0.0f && board.castlings().no_legal_castle() &&
-              (board.ours() | board.theirs()).count() <=
-                  tablebase->max_cardinality()) {
-            MoveList to_boost;
-            MoveList maybe_boost;
-            tablebase->root_probe(history.Last(), true, true, &to_boost);
-            if (history.DidRepeatSinceLastZeroingMove()) {
-              maybe_boost = to_boost;
-            } else {
-              tablebase->root_probe(history.Last(), false, true, &maybe_boost);
-            }
-            // If there is only one move, dtm fixup is not helpful.
-            // This code assumes all gaviota 3-4-5 tbs are present, as checked
-            // at startup.
-            if (gaviotaEnabled && maybe_boost.size() > 1 &&
-                (board.ours() | board.theirs()).count() <= 5) {
-              std::vector<unsigned int> dtms;
-              dtms.resize(maybe_boost.size());
-              unsigned int mininum_dtm = 1000;
-              // Only safe moves being considered, boost the smallest dtm
-              // amongst them.
-              for (auto& move : maybe_boost) {
-                Position next_pos = Position(history.Last(), move);
-                unsigned int info;
-                unsigned int dtm;
-                gaviota_tb_probe_hard(next_pos, info, dtm);
-                dtms.push_back(dtm);
-                if (dtm < mininum_dtm) mininum_dtm = dtm;
-              }
-              if (mininum_dtm < 1000) {
-                to_boost.clear();
-                int dtm_idx = 0;
-                for (auto& move : maybe_boost) {
-                  if (dtms[dtm_idx] == mininum_dtm) {
-                    to_boost.push_back(move);
-                  }
-                  dtm_idx++;
-                }
-                policy_dtm_bump++;
-              }
-            }
-            int transform = TransformForPosition(input_format, history);
-            for (auto& move : to_boost) {
-              boost_probs[move.as_nn_index(transform)] = true;
-            }
-            boost_count = to_boost.size();
-          }
-          float sum = 0.0;
-          int prob_index = 0;
-          float preboost_sum = 0.0f;
-          for (auto& prob : chunk.probabilities) {
-            float offset =
-                distOffset +
-                (boost_probs[prob_index] ? (dtzBoost / boost_count) : 0.0f);
-            if (dtzBoost != 0.0f && boost_probs[prob_index]) {
-              preboost_sum += prob;
-              if (prob < 0 || std::isnan(prob))
-                std::cerr << "Bump for move that is illegal????" << std::endl;
-              policy_bump++;
-            }
-            prob_index++;
-            if (prob < 0 || std::isnan(prob)) continue;
-            prob = std::max(0.0f, prob + offset);
-            prob = std::pow(prob, 1.0f / distTemp);
-            sum += prob;
-          }
-          prob_index = 0;
-          float boost_sum = 0.0f;
-          for (auto& prob : chunk.probabilities) {
-            if (dtzBoost != 0.0f && boost_probs[prob_index]) {
-              boost_sum += prob / sum;
-            }
-            prob_index++;
-            if (prob < 0 || std::isnan(prob)) continue;
-            prob /= sum;
-          }
-          if (boost_count > 0) {
-            policy_nobump_total_hist[(int)(preboost_sum * 10)]++;
-            policy_bump_total_hist[(int)(boost_sum * 10)]++;
-          }
-          history.Append(moves[move_index]);
-          move_index++;
-        }
-      }
-
-      // Make move_count field plies_left for moves left head.
-      int offset = 0;
-      bool all_draws = true;
-      for (auto& chunk : fileContents) {
-        // plies_left can't be 0 for real v5 data, so if it is 0 it must be a v4
-        // conversion, and we should populate it ourselves with a better
-        // starting estimate.
-        if (chunk.plies_left == 0.0f) {
-          chunk.plies_left = (int)(fileContents.size() - offset);
-        }
-        offset++;
-        all_draws = all_draws && (ResultForData(chunk) == 0);
-      }
-
-      // Correct plies_left using Gaviota TBs for 5 piece and less positions.
-      if (gaviotaEnabled && !all_draws) {
-        PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                      &board, &rule50ply, &gameply);
-        history.Reset(board, rule50ply, gameply);
-        int last_rescore = 0;
-        for (size_t i = 0; i < moves.size(); i++) {
-          history.Append(moves[i]);
-          const auto& board = history.Last().GetBoard();
-
-          // Gaviota TBs don't have 50 move rule.
-          // Only consider positions that are not draw after rescoring.
-          if ((ResultForData(fileContents[i + 1]) != 0) &&
-              board.castlings().no_legal_castle() &&
-              (board.ours() | board.theirs()).count() <= 5) {
-            std::vector<int> dtms;
-            unsigned int info;
-            unsigned int dtm;
-            gaviota_tb_probe_hard(history.Last(), info, dtm);
-            if (info != tb_WMATE && info != tb_BMATE) {
-              // Not a win for either player.
-              continue;
-            }
-            int steps = history.Last().GetRule50Ply();
-            if ((dtm + steps > 99) && (dtm <= fileContents[i + 1].plies_left)) {
-              // Following DTM could trigger 50 move rule and the current
-              // move_count is more than DTM.
-              // If DTM is more than the current move_count then we can rescore
-              // using it since DTM50 is not shorter than DTM.
-              continue;
-            }
-            bool no_reps = true;
-            for (int i = 0; i < steps; i++) {
-              // If game started from non-zero 50 move rule, this could
-              // underflow. Only safe option is to assume there were repetitions
-              // before this point.
-              if (history.GetLength() - i - 1 < 0) {
-                no_reps = false;
-                break;
-              }
-              if (history.GetPositionAt(history.GetLength() - i - 1)
-                      .GetRepetitions() != 0) {
-                no_reps = false;
-                break;
-              }
-            }
-            if (!no_reps) {
-              // There were repetitions. Do nothing since DTM path
-              // could trigger draw by repetition.
-              continue;
-            }
-            gaviota_dtm_rescores++;
-            int j;
-            for (j = i; j >= -1; j--) {
-              if (j <= last_rescore) {
-                break;
-              }
-              // std::cerr << j << " " << int(fileContents[j + 1].move_count) <<
-              // " -> " << int(dtm + (i - j)) << std::endl;
-              fileContents[j + 1].plies_left = int(dtm + (i - j));
-            }
-            last_rescore = i;
-          }
-        }
-      }
-
-      // Correct move_count using DTZ for 3 piece no-pawn positions only.
-      // If Gaviota TBs are enabled no need to use syzygy.
-      if (!gaviotaEnabled && !all_draws) {
-        PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                      &board, &rule50ply, &gameply);
-        history.Reset(board, rule50ply, gameply);
-        for (size_t i = 0; i < moves.size(); i++) {
-          history.Append(moves[i]);
-          const auto& board = history.Last().GetBoard();
-          if (board.castlings().no_legal_castle() &&
-              (board.ours() | board.theirs()).count() <= 3 &&
-              board.pawns().empty()) {
-            ProbeState state;
-            WDLScore wdl = tablebase->probe_wdl(history.Last(), &state);
-            // Only fail state means the WDL is wrong, probe_wdl may produce
-            // correct result with a stat other than OK.
-            if (state != FAIL) {
-              int8_t score_to_apply = 0;
-              if (wdl == WDL_WIN) {
-                score_to_apply = 1;
-              } else if (wdl == WDL_LOSS) {
-                score_to_apply = -1;
-              }
-              // No point updating for draws.
-              if (score_to_apply == 0) continue;
-              // Any repetitions in the history since last 50 ply makes it risky
-              // to assume dtz is still correct.
-              int steps = history.Last().GetRule50Ply();
-              bool no_reps = true;
-              for (int i = 0; i < steps; i++) {
-                // If game started from non-zero 50 move rule, this could
-                // underflow. Only safe option is to assume there were
-                // repetitions before this point.
-                if (history.GetLength() - i - 1 < 0) {
-                  no_reps = false;
-                  break;
-                }
-                if (history.GetPositionAt(history.GetLength() - i - 1)
-                        .GetRepetitions() != 0) {
-                  no_reps = false;
-                  break;
-                }
-              }
-              if (no_reps) {
-                int depth = tablebase->probe_dtz(history.Last(), &state);
-                if (state != FAIL) {
-                  // if depth == -1 this is wrong, since that is mate and the
-                  // answer should be 0, but the move before depth is -2. Since
-                  // data never contains mate position, ignore that discrepency.
-                  int converted_ply_remaining = std::abs(depth);
-                  // This should be able to be <= 99 safely, but I've not
-                  // convinced myself thats true.
-                  if (steps + std::abs(depth) < 99) {
-                    fileContents[i + 1].plies_left = converted_ply_remaining;
-                  }
-                  if (steps == 0) {
-                    for (int j = i; j >= 0; j--) {
-                      fileContents[j].plies_left =
-                          converted_ply_remaining + (i + 1 - j);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-      // Deblunder only works from v6 data onwards. We therefore check
-      // the visits field which is 0 if we're dealing with upgraded data.
-      if (deblunderEnabled && fileContents.back().visits > 0) {
-        PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                      &board, &rule50ply, &gameply);
-        history.Reset(board, rule50ply, gameply);
-        for (size_t i = 0; i < moves.size(); i++) {
-          history.Append(moves[i]);
-          const auto& board = history.Last().GetBoard();
-          if (board.castlings().no_legal_castle() &&
-              (board.ours() | board.theirs()).count() <=
-                  tablebase->max_cardinality()) {
-            history.Pop();
-            break;
-          }
-        }
-        float activeZ[3] = {fileContents.back().result_q,
-                            fileContents.back().result_d,
-                            fileContents.back().plies_left};
-        bool deblunderingStarted = false;
-        while (true) {
-          auto& cur = fileContents[history.GetLength() - 1];
-          // A blunder is defined by the played move being worse than the
-          // best move by a defined threshold, missing a forced win, or
-          // playing into a proven loss without being forced.
-          bool deblunderTriggerThreshold =
-              (cur.best_q - cur.played_q >
-               deblunderQBlunderThreshold - deblunderQBlunderWidth / 2.0);
-          bool deblunderTriggerTerminal =
-              (cur.best_q > -1 && cur.played_q < 1 &&
-               ((cur.best_q == 1 && ((cur.invariance_info & 8) != 0)) ||
-                cur.played_q == -1));
-          if (deblunderTriggerThreshold || deblunderTriggerTerminal) {
-            float newZRatio = 1.0f;
-            // If width > 0 and the deblunder didn't involve a terminal
-            // position, we apply a soft threshold by averaging old and new Z.
-            if (deblunderQBlunderWidth > 0 && !deblunderTriggerTerminal) {
-              newZRatio = std::min(1.0f, (cur.best_q - cur.played_q -
-                                          deblunderQBlunderThreshold) /
-                                                 deblunderQBlunderWidth +
-                                             0.5f);
-            }
-            // Instead of averaging, a randomization can be applied here with
-            // newZRatio = newZRatio > rand( [0, 1) ) ? 1.0f : 0.0f;
-            activeZ[0] = (1 - newZRatio) * activeZ[0] + newZRatio * cur.best_q;
-            activeZ[1] = (1 - newZRatio) * activeZ[1] + newZRatio * cur.best_d;
-            activeZ[2] = (1 - newZRatio) * activeZ[2] + newZRatio * cur.best_m;
-            deblunderingStarted = true;
-            blunders += 1;
-            /* std::cout << "Blunder detected. Best move q=" << cur.best_q <<
-             " played move q=" << cur.played_q; */
-          }
-          if (deblunderingStarted) {
-            /*
-            std::cerr << "Deblundering: "
-                      << fileContents[history.GetLength() - 1].best_q << " "
-                      << fileContents[history.GetLength() - 1].best_d << " "
-                      << (int)fileContents[history.GetLength() - 1].result << "
-            "
-                      << (int)activeZ << std::endl;
-                      */
-            fileContents[history.GetLength() - 1].result_q = activeZ[0];
-            fileContents[history.GetLength() - 1].result_d = activeZ[1];
-            fileContents[history.GetLength() - 1].plies_left = activeZ[2];
-          }
-          if (history.GetLength() == 1) break;
-          // Q values are always from the player to move.
-          activeZ[0] = -activeZ[0];
-          // Estimated remaining plies left has to be increased.
-          activeZ[2] += 1.0f;
-          history.Pop();
-        }
-      }
-      if (newInputFormat != -1) {
-        PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]),
-                      &board, &rule50ply, &gameply);
-        history.Reset(board, rule50ply, gameply);
-        ChangeInputFormat(newInputFormat, &fileContents[0], history);
-        for (size_t i = 0; i < moves.size(); i++) {
-          history.Append(moves[i]);
-          ChangeInputFormat(newInputFormat, &fileContents[i + 1], history);
-        }
-      }
-
-      if (!outputDir.empty()) {
-        std::string fileName = file.substr(file.find_last_of("/\\") + 1);
-        TrainingDataWriter writer(outputDir + "/" + fileName);
-        for (auto chunk : fileContents) {
-          // Don't save chunks that just provide move history.
-          if ((chunk.invariance_info & 64) == 0) {
-            writer.WriteChunk(chunk);
-          }
-        }
-      }
-
-      // Output data in Stockfish plain format.
-      if (!nnue_plain_file.empty()) {
-        static Mutex mutex;
-        std::ostringstream out;
-        pblczero::NetworkFormat::InputFormat format;
-        if (newInputFormat != -1) {
-          format =
-              static_cast<pblczero::NetworkFormat::InputFormat>(newInputFormat);
-        } else {
-          format = input_format;
-        }
-        PopulateBoard(format, PlanesFromTrainingData(fileContents[0]), &board,
-                      &rule50ply, &gameply);
-        history.Reset(board, rule50ply, gameply);
-        for (size_t i = 0; i < fileContents.size(); i++) {
-          auto chunk = fileContents[i];
-          Position p = history.Last();
-          if (chunk.visits > 0) {
-            // Format is v6 and position is evaluated.
-            Move m = MoveFromNNIndex(
-                flags.nnue_best_move ? chunk.best_idx : chunk.played_idx,
-                TransformForPosition(format, history));
-            float q = flags.nnue_best_score ? chunk.best_q : chunk.played_q;
-            out << AsNnueString(p, m, q, round(chunk.result_q));
-          } else if (i < moves.size()) {
-            out << AsNnueString(p, moves[i], chunk.best_q,
-                                round(chunk.result_q));
-          }
-          if (i < moves.size()) {
-            history.Append(moves[i]);
-          }
-        }
-        std::ofstream file;
-        Mutex::Lock lock(mutex);
-        file.open(nnue_plain_file, std::ios_base::app);
-        if (file.is_open()) {
-          file << out.str();
-          file.close();
-        }
-      }
-    } catch (Exception& ex) {
-      std::cerr << "While processing: " << file
-                << " - Exception thrown: " << ex.what() << std::endl;
-      if (flags.delete_files) {
-        std::cerr << "It will be deleted." << std::endl;
-      }
-    }
-  }
-  if (flags.delete_files) {
-    remove(file.c_str());
-  }
-}
-
-void ProcessFiles(const std::vector<std::string>& files,
-                  SyzygyTablebase* tablebase, std::string outputDir,
-                  float distTemp, float distOffset, float dtzBoost,
-                  int newInputFormat, int offset, int mod,
-                  std::string nnue_plain_file, ProcessFileFlags flags) {
-  std::cerr << "Thread: " << offset << " starting" << std::endl;
-  for (size_t i = offset; i < files.size(); i += mod) {
-    if (files[i].rfind(".gz") != files[i].size() - 3) {
-      std::cerr << "Skipping: " << files[i] << std::endl;
-      continue;
-    }
-    ProcessFile(files[i], tablebase, outputDir, distTemp, distOffset, dtzBoost,
-                newInputFormat, nnue_plain_file, flags);
-  }
-}
-
-void BuildSubs(const std::vector<std::string>& files) {
-  for (auto& file : files) {
-    TrainingDataReader reader(file);
-    std::vector<V6TrainingData> fileContents;
-    V6TrainingData data;
-    while (reader.ReadChunk(&data)) {
-      fileContents.push_back(data);
-    }
-    Validate(fileContents);
-    MoveList moves;
-    for (size_t i = 1; i < fileContents.size(); i++) {
-      moves.push_back(
-          DecodeMoveFromInput(PlanesFromTrainingData(fileContents[i]),
-                              PlanesFromTrainingData(fileContents[i - 1])));
-      // All moves decoded are from the point of view of the side after the
-      // move so need to mirror them all to be applicable to apply to the
-      // position before.
-      moves.back().Mirror();
-    }
-    Validate(fileContents, moves);
-
-    // Subs are 'valid'.
-    PositionHistory history;
-    int rule50ply;
-    int gameply;
-    ChessBoard board;
-    auto input_format = static_cast<pblczero::NetworkFormat::InputFormat>(
-        fileContents[0].input_format);
-    PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]), &board,
-                  &rule50ply, &gameply);
-    history.Reset(board, rule50ply, gameply);
-    uint64_t rootHash = HashCat(board.Hash(), rule50ply);
-    PolicySubNode* rootNode = &policy_subs[rootHash];
-    for (size_t i = 0; i < fileContents.size(); i++) {
-      if ((fileContents[i].invariance_info & 64) == 0) {
-        rootNode->active = true;
-        for (int j = 0; j < 1858; j++) {
-          rootNode->policy[j] = fileContents[i].probabilities[j];
-        }
-      }
-      if (i < fileContents.size() - 1) {
-        int transform = TransformForPosition(input_format, history);
-        int idx = moves[i].as_nn_index(transform);
-        if (rootNode->children[idx] == nullptr) {
-          rootNode->children[idx] = new PolicySubNode();
-        }
-        rootNode = rootNode->children[idx];
-        history.Append(moves[i]);
-      }
-    }
-  }
-}
-
-}  // namespace
-
-RescoreLoop::RescoreLoop() {}
-
-RescoreLoop::~RescoreLoop() {}
-
-#ifdef _WIN32
-#define SEP_CHAR ';'
-#else
-#define SEP_CHAR ':'
-#endif
-
-void RescoreLoop::RunLoop() {
-  orig_counts[0] = 0;
-  orig_counts[1] = 0;
-  orig_counts[2] = 0;
-  fixed_counts[0] = 0;
-  fixed_counts[1] = 0;
-  fixed_counts[2] = 0;
-  for (int i = 0; i < 11; i++) policy_bump_total_hist[i] = 0;
-  for (int i = 0; i < 11; i++) policy_nobump_total_hist[i] = 0;
-  options_.Add<StringOption>(kSyzygyTablebaseId);
-  options_.Add<StringOption>(kGaviotaTablebaseId);
-  options_.Add<StringOption>(kInputDirId);
-  options_.Add<StringOption>(kOutputDirId);
-  options_.Add<StringOption>(kPolicySubsDirId);
-  options_.Add<IntOption>(kThreadsId, 1, 20) = 1;
-  options_.Add<FloatOption>(kTempId, 0.001, 100) = 1;
-  // Positive dist offset requires knowing the legal move set, so not supported
-  // for now.
-  options_.Add<FloatOption>(kDistributionOffsetId, -0.999, 0) = 0;
-  options_.Add<FloatOption>(kMinDTZBoostId, 0, 1) = 0;
-  options_.Add<IntOption>(kNewInputFormatId, -1, 256) = -1;
-  options_.Add<BoolOption>(kDeblunder) = false;
-  options_.Add<FloatOption>(kDeblunderQBlunderThreshold, 0.0f, 2.0f) = 2.0f;
-  options_.Add<FloatOption>(kDeblunderQBlunderWidth, 0.0f, 2.0f) = 0.0f;
-  options_.Add<StringOption>(kNnuePlainFileId);
-  options_.Add<BoolOption>(kNnueBestScoreId) = true;
-  options_.Add<BoolOption>(kNnueBestMoveId) = false;
-  options_.Add<BoolOption>(kDeleteFilesId) = true;
-
-  if (!options_.ProcessAllFlags()) return;
-
-  if (options_.GetOptionsDict().IsDefault<std::string>(kOutputDirId) &&
-      options_.GetOptionsDict().IsDefault<std::string>(kNnuePlainFileId)) {
-    std::cerr << "Must provide an output dir or NNUE plain file." << std::endl;
-    return;
-  }
-
-  deblunderEnabled = options_.GetOptionsDict().Get<bool>(kDeblunder);
-  deblunderQBlunderThreshold =
-      options_.GetOptionsDict().Get<float>(kDeblunderQBlunderThreshold);
-  deblunderQBlunderWidth =
-      options_.GetOptionsDict().Get<float>(kDeblunderQBlunderWidth);
-
-  SyzygyTablebase tablebase;
-  if (!tablebase.init(
-          options_.GetOptionsDict().Get<std::string>(kSyzygyTablebaseId)) ||
-      tablebase.max_cardinality() < 3) {
-    std::cerr << "FAILED TO LOAD SYZYGY" << std::endl;
-    return;
-  }
-  auto dtmPaths =
-      options_.GetOptionsDict().Get<std::string>(kGaviotaTablebaseId);
-  if (dtmPaths.size() != 0) {
-    std::stringstream path_string_stream(dtmPaths);
-    std::string path;
-    auto paths = tbpaths_init();
-    while (std::getline(path_string_stream, path, SEP_CHAR)) {
-      paths = tbpaths_add(paths, path.c_str());
-    }
-    tb_init(0, tb_CP4, paths);
-    tbcache_init(64 * 1024 * 1024, 64);
-    if (tb_availability() != 63) {
-      std::cerr << "UNEXPECTED gaviota availability" << std::endl;
-      return;
-    } else {
-      std::cerr << "Found Gaviota TBs" << std::endl;
-    }
-    gaviotaEnabled = true;
-  }
-  auto policySubsDir =
-      options_.GetOptionsDict().Get<std::string>(kPolicySubsDirId);
-  if (policySubsDir.size() != 0) {
-    auto policySubFiles = GetFileList(policySubsDir);
-    for (size_t i = 0; i < policySubFiles.size(); i++) {
-      policySubFiles[i] = policySubsDir + "/" + policySubFiles[i];
-    }
-    BuildSubs(policySubFiles);
-  }
-
-  auto inputDir = options_.GetOptionsDict().Get<std::string>(kInputDirId);
-  if (inputDir.size() == 0) {
-    std::cerr << "Must provide an input dir." << std::endl;
-    return;
-  }
-  auto files = GetFileList(inputDir);
-  if (files.size() == 0) {
-    std::cerr << "No files to process" << std::endl;
-    return;
-  }
-  for (size_t i = 0; i < files.size(); i++) {
-    files[i] = inputDir + "/" + files[i];
-  }
-  float dtz_boost = options_.GetOptionsDict().Get<float>(kMinDTZBoostId);
-  unsigned int threads = options_.GetOptionsDict().Get<int>(kThreadsId);
-  ProcessFileFlags flags;
-  flags.delete_files = options_.GetOptionsDict().Get<bool>(kDeleteFilesId);
-  flags.nnue_best_score = options_.GetOptionsDict().Get<bool>(kNnueBestScoreId);
-  flags.nnue_best_move = options_.GetOptionsDict().Get<bool>(kNnueBestMoveId);
-  if (threads > 1) {
-    std::vector<std::thread> threads_;
-    int offset = 0;
-    while (threads_.size() < threads) {
-      int offset_val = offset;
-      offset++;
-      threads_.emplace_back([this, offset_val, files, &tablebase, threads,
-                             dtz_boost, flags]() {
-        ProcessFiles(
-            files, &tablebase,
-            options_.GetOptionsDict().Get<std::string>(kOutputDirId),
-            options_.GetOptionsDict().Get<float>(kTempId),
-            options_.GetOptionsDict().Get<float>(kDistributionOffsetId),
-            dtz_boost, options_.GetOptionsDict().Get<int>(kNewInputFormatId),
-            offset_val, threads,
-            options_.GetOptionsDict().Get<std::string>(kNnuePlainFileId),
-            flags);
-      });
-    }
-    for (size_t i = 0; i < threads_.size(); i++) {
-      threads_[i].join();
-    }
-
-  } else {
-    ProcessFiles(files, &tablebase,
-                 options_.GetOptionsDict().Get<std::string>(kOutputDirId),
-                 options_.GetOptionsDict().Get<float>(kTempId),
-                 options_.GetOptionsDict().Get<float>(kDistributionOffsetId),
-                 dtz_boost,
-                 options_.GetOptionsDict().Get<int>(kNewInputFormatId), 0, 1,
-                 options_.GetOptionsDict().Get<std::string>(kNnuePlainFileId),
-                 flags);
-  }
-  std::cout << "Games processed: " << games << std::endl;
-  std::cout << "Positions processed: " << positions << std::endl;
-  std::cout << "Rescores performed: " << rescored << std::endl;
-  std::cout << "Cumulative outcome change: " << delta << std::endl;
-  std::cout << "Secondary rescores performed: " << rescored2 << std::endl;
-  std::cout << "Secondary rescores performed used dtz: " << rescored3
-            << std::endl;
-  std::cout << "Blunders picked up by deblunder threshold: " << blunders
-            << std::endl;
-  std::cout << "Number of policy values boosted by dtz or dtm " << policy_bump
-            << std::endl;
-  std::cout << "Number of policy values boosted by dtm " << policy_dtm_bump
-            << std::endl;
-  std::cout << "Orig policy_sum dist of boost candidate:";
-  std::cout << std::endl;
-  int event_sum = 0;
-  for (int i = 0; i < 11; i++) event_sum += policy_bump_total_hist[i];
-  for (int i = 0; i < 11; i++) {
-    std::cout << " " << std::setprecision(4)
-              << ((float)policy_nobump_total_hist[i] / (float)event_sum);
-  }
-  std::cout << std::endl;
-  std::cout << "Boosted policy_sum dist of boost candidate:";
-  std::cout << std::endl;
-  for (int i = 0; i < 11; i++) {
-    std::cout << " " << std::setprecision(4)
-              << ((float)policy_bump_total_hist[i] / (float)event_sum);
-  }
-  std::cout << std::endl;
-  std::cout << "Original L: " << orig_counts[0] << " D: " << orig_counts[1]
-            << " W: " << orig_counts[2] << std::endl;
-  std::cout << "After L: " << fixed_counts[0] << " D: " << fixed_counts[1]
-            << " W: " << fixed_counts[2] << std::endl;
-  std::cout << "Gaviota DTM move_count rescores: " << gaviota_dtm_rescores
-            << std::endl;
-}
-
-}  // namespace lczero
diff --git a/src/rescorer_main.cc b/src/rescorer_main.cc
index f7acee7853..7bb367dbed 100644
--- a/src/rescorer_main.cc
+++ b/src/rescorer_main.cc
@@ -28,7 +28,7 @@
 #include <iostream>
 
 #include "chess/board.h"
-#include "rescorer/rescoreloop.h"
+#include "trainingdata/rescorer.h"
 #include "utils/commandline.h"
 #include "utils/esc_codes.h"
 #include "utils/exception.h"
@@ -53,8 +53,7 @@ int main(int argc, const char** argv) {
 
     // Consuming optional "rescore" mode.
     CommandLine::ConsumeCommand("rescore");
-    RescoreLoop loop;
-    loop.RunLoop();
+    RunRescorer();
   } catch (std::exception& e) {
     std::cerr << "Unhandled exception: " << e.what() << std::endl;
     abort();
diff --git a/src/search/artifacts.h b/src/search/artifacts.h
new file mode 100644
index 0000000000..2ef68c5cf5
--- /dev/null
+++ b/src/search/artifacts.h
@@ -0,0 +1,37 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+namespace lczero {
+
+// Contains the search artifacts that are needed e.g. to build the training
+// data. The selfplay loop would fetch this from search to build training data
+// frames. Currently an empty placeholder: no members are defined yet.
+struct SearchArtifacts {};
+
+}  // namespace lczero
diff --git a/src/mcts/node.cc b/src/search/classic/node.cc
similarity index 94%
rename from src/mcts/node.cc
rename to src/search/classic/node.cc
index c3ec4b010e..d473d16a90 100644
--- a/src/mcts/node.cc
+++ b/src/search/classic/node.cc
@@ -25,7 +25,7 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/node.h"
+#include "search/classic/node.h"
 
 #include <algorithm>
 #include <cassert>
@@ -41,6 +41,7 @@
 #include "utils/hashcat.h"
 
 namespace lczero {
+namespace classic {
 
 /////////////////////////////////////////////////////////////////////////
 // Node garbage collector
@@ -123,7 +124,7 @@ NodeGarbageCollector gNodeGc;
 Move Edge::GetMove(bool as_opponent) const {
   if (!as_opponent) return move_;
   Move m = move_;
-  m.Mirror();
+  m.Flip();
   return m;
 }
 
@@ -176,7 +177,8 @@ float Edge::GetP() const {
 
 std::string Edge::DebugString() const {
   std::ostringstream oss;
-  oss << "Move: " << move_.as_string() << " p_: " << p_ << " GetP: " << GetP();
+  oss << "Move: " << move_.ToString(true) << " p_: " << p_
+      << " GetP: " << GetP();
   return oss.str();
 }
 
@@ -236,8 +238,7 @@ std::string Node::DebugString() const {
       << " WL:" << wl_ << " N:" << n_ << " N_:" << n_in_flight_
       << " Edges:" << static_cast<int>(num_edges_)
       << " Bounds:" << static_cast<int>(lower_bound_) - 2 << ","
-      << static_cast<int>(upper_bound_) - 2
-      << " Solid:" << solid_children_;
+      << static_cast<int>(upper_bound_) - 2 << " Solid:" << solid_children_;
   return oss.str();
 }
 
@@ -274,7 +275,8 @@ bool Node::MakeSolid() {
   while (old_child) {
     int index = old_child->index_;
     new_children[index] = std::move(*old_child.get());
-    // This isn't needed, but it helps crash things faster if something has gone wrong.
+    // This isn't needed, but it helps crash things faster if something has gone
+    // wrong.
     old_child->parent_ = nullptr;
     gNodeGc.AddToGcQueue(std::move(old_child));
     new_children[index].UpdateChildrenParents();
@@ -349,9 +351,7 @@ bool Node::TryStartScoreUpdate() {
   return true;
 }
 
-void Node::CancelScoreUpdate(int multivisit) {
-  n_in_flight_ -= multivisit;
-}
+void Node::CancelScoreUpdate(int multivisit) { n_in_flight_ -= multivisit; }
 
 void Node::FinalizeScoreUpdate(float v, float d, float m, int multivisit) {
   // Recompute Q.
@@ -463,12 +463,9 @@ std::string EdgeAndNode::DebugString() const {
 /////////////////////////////////////////////////////////////////////////
 
 void NodeTree::MakeMove(Move move) {
-  if (HeadPosition().IsBlackToMove()) move.Mirror();
-  const auto& board = HeadPosition().GetBoard();
-
   Node* new_head = nullptr;
   for (auto& n : current_head_->Edges()) {
-    if (board.IsSameMove(n.GetMove(), move)) {
+    if (n.GetMove() == move) {
       new_head = n.GetOrSpawnNode(current_head_);
       // Ensure head is not terminal, so search can extend or visit children of
       // "terminal" positions, e.g., WDL hits, converted terminals, 3-fold draw.
@@ -476,7 +473,6 @@ void NodeTree::MakeMove(Move move) {
       break;
     }
   }
-  move = board.GetModernMove(move);
   current_head_->ReleaseChildrenExceptOne(new_head);
   new_head = current_head_->child_.get();
   current_head_ =
@@ -494,15 +490,8 @@ void NodeTree::TrimTreeAtHead() {
   current_head_->sibling_ = std::move(tmp);
 }
 
-bool NodeTree::ResetToPosition(const std::string& starting_fen,
-                               const std::vector<Move>& moves) {
-  ChessBoard starting_board;
-  int no_capture_ply;
-  int full_moves;
-  starting_board.SetFromFen(starting_fen, &no_capture_ply, &full_moves);
-  if (gamebegin_node_ &&
-      (history_.Starting().GetBoard() != starting_board ||
-       history_.Starting().GetRule50Ply() != no_capture_ply)) {
+bool NodeTree::ResetToPosition(const GameState& pos) {
+  if (gamebegin_node_ && (history_.Starting() != pos.startpos)) {
     // Completely different position.
     DeallocateTree();
   }
@@ -511,14 +500,13 @@ bool NodeTree::ResetToPosition(const std::string& starting_fen,
     gamebegin_node_ = std::make_unique<Node>(nullptr, 0);
   }
 
-  history_.Reset(starting_board, no_capture_ply,
-                 full_moves * 2 - (starting_board.flipped() ? 1 : 2));
+  history_.Reset(pos.startpos);
 
   Node* old_head = current_head_;
   current_head_ = gamebegin_node_.get();
   bool seen_old_head = (gamebegin_node_.get() == old_head);
-  for (const auto& move : moves) {
-    MakeMove(move);
+  for (const Move m : pos.moves) {
+    MakeMove(m);
     if (old_head == current_head_) seen_old_head = true;
   }
 
@@ -531,6 +519,21 @@ bool NodeTree::ResetToPosition(const std::string& starting_fen,
   return seen_old_head;
 }
 
+// Parses each move string on a scratch board, then resets via GameState.
+bool NodeTree::ResetToPosition(const std::string& starting_fen,
+                               const std::vector<std::string>& moves) {
+  GameState game;
+  game.startpos = Position::FromFen(starting_fen);
+  game.moves.reserve(moves.size());
+  ChessBoard board = game.startpos.GetBoard();
+  for (const std::string& move_str : moves) {
+    game.moves.push_back(board.ParseMove(move_str));
+    board.ApplyMove(game.moves.back());
+    board.Mirror();
+  }
+  return ResetToPosition(game);
+}
+
 void NodeTree::DeallocateTree() {
   // Same as gamebegin_node_.reset(), but actual deallocation will happen in
   // GC thread.
@@ -539,4 +542,5 @@ void NodeTree::DeallocateTree() {
   current_head_ = nullptr;
 }
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/node.h b/src/search/classic/node.h
similarity index 98%
rename from src/mcts/node.h
rename to src/search/classic/node.h
index 2982de24da..8a4e598fdb 100644
--- a/src/mcts/node.h
+++ b/src/search/classic/node.h
@@ -35,13 +35,14 @@
 
 #include "chess/board.h"
 #include "chess/callbacks.h"
+#include "chess/gamestate.h"
 #include "chess/position.h"
-#include "neural/cache.h"
 #include "neural/encoder.h"
 #include "proto/net.pb.h"
 #include "utils/mutex.h"
 
 namespace lczero {
+namespace classic {
 
 // Children of a node are stored the following way:
 // * Edges and Nodes edges point to are stored separately.
@@ -438,6 +439,11 @@ class Edge_Iterator : public EdgeAndNode {
  public:
   using Ptr = std::conditional_t<is_const, const std::unique_ptr<Node>*,
                                  std::unique_ptr<Node>*>;
+  using value_type = Edge_Iterator;
+  using iterator_category = std::forward_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using pointer = Edge_Iterator*;
+  using reference = Edge_Iterator&;
 
   // Creates "end()" iterator.
   Edge_Iterator() {}
@@ -642,7 +648,8 @@ class NodeTree {
   // moves added). Returns false, if the position is completely different,
   // or if it's shorter than before.
   bool ResetToPosition(const std::string& starting_fen,
-                       const std::vector<Move>& moves);
+                       const std::vector<std::string>& moves);
+  bool ResetToPosition(const GameState& pos);
   const Position& HeadPosition() const { return history_.Last(); }
   int GetPlyCount() const { return HeadPosition().GetGamePly(); }
   bool IsBlackToMove() const { return HeadPosition().IsBlackToMove(); }
@@ -659,4 +666,5 @@ class NodeTree {
   PositionHistory history_;
 };
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/params.cc b/src/search/classic/params.cc
similarity index 67%
rename from src/mcts/params.cc
rename to src/search/classic/params.cc
index 22310477e1..e61b0f9c88 100644
--- a/src/mcts/params.cc
+++ b/src/search/classic/params.cc
@@ -1,6 +1,6 @@
 /*
   This file is part of Leela Chess Zero.
-  Copyright (C) 2018-2023 The LCZero Authors
+  Copyright (C) 2018-2025 The LCZero Authors
 
   Leela Chess is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -25,12 +25,13 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/params.h"
+#include "search/classic/params.h"
 
 #include <algorithm>
 #include <cctype>
 #include <cmath>
 
+#include "neural/shared_params.h"
 #include "utils/exception.h"
 #include "utils/string.h"
 
@@ -41,11 +42,9 @@
 #ifndef DEFAULT_MAX_PREFETCH
 #define DEFAULT_MAX_PREFETCH 32
 #endif
-#ifndef DEFAULT_TASK_WORKERS
-#define DEFAULT_TASK_WORKERS 4
-#endif
 
 namespace lczero {
+namespace classic {
 
 namespace {
 FillEmptyHistory EncodeHistoryFill(std::string history_fill) {
@@ -90,7 +89,7 @@ float GetContempt(std::string name, std::string contempt_str,
 // Calculate ratio and diff for WDL conversion from the contempt settings.
 // More accurate model, allowing book bias dependent Elo calculation.
 // Doesn't take lower accuracy of opponent into account and needs clamping.
-SearchParams::WDLRescaleParams AccurateWDLRescaleParams(
+BaseSearchParams::WDLRescaleParams AccurateWDLRescaleParams(
     float contempt, float draw_rate_target, float draw_rate_reference,
     float book_exit_bias, float contempt_max, float contempt_attenuation) {
   // Catch accidental low positive values of draw_rate_target to guarantee
@@ -113,7 +112,7 @@ SearchParams::WDLRescaleParams AccurateWDLRescaleParams(
            std::pow(std::cosh(0.5f * (1 + book_exit_bias) / scale_target), 2)) *
       std::log(10) / 200 * std::clamp(contempt, -contempt_max, contempt_max) *
       contempt_attenuation;
-  return SearchParams::WDLRescaleParams(ratio, diff);
+  return BaseSearchParams::WDLRescaleParams(ratio, diff);
 }
 
 // Converts regular Elo into ideal UHO game pair Elo based on the same Elo
@@ -135,7 +134,7 @@ float ConvertRegularToGamePairElo(float elo_regular) {
 // Less accurate Elo model, but automatically chooses draw rate and accuracy
 // based on the absolute Elo of both sides. Doesn't require clamping, but still
 // uses the parameter.
-SearchParams::WDLRescaleParams SimplifiedWDLRescaleParams(
+BaseSearchParams::WDLRescaleParams SimplifiedWDLRescaleParams(
     float contempt, float draw_rate_reference, float elo_active,
     float contempt_max, float contempt_attenuation) {
   // Scale parameter of the logistic WDL distribution is fitted as a sigmoid,
@@ -172,328 +171,379 @@ SearchParams::WDLRescaleParams SimplifiedWDLRescaleParams(
       std::log(1.0f + std::exp(-elo_opp / elo_slope + offset) / scale_zero);
   float diff = 1.0f / (scale_reference * scale_reference) *
                (mu_active - mu_opp) * contempt_attenuation;
-  return SearchParams::WDLRescaleParams(ratio, diff);
+  return BaseSearchParams::WDLRescaleParams(ratio, diff);
 }
 }  // namespace
 
-const OptionId SearchParams::kMiniBatchSizeId{
+const OptionId BaseSearchParams::kMiniBatchSizeId{
     "minibatch-size", "MinibatchSize",
     "How many positions the engine tries to batch together for parallel NN "
     "computation. Larger batches may reduce strength a bit, especially with a "
     "small number of playouts. Set to 0 to use a backend suggested value."};
-const OptionId SearchParams::kMaxPrefetchBatchId{
-    "max-prefetch", "MaxPrefetch",
-    "When the engine cannot gather a large enough batch for immediate use, try "
-    "to prefetch up to X positions which are likely to be useful soon, and put "
-    "them into cache."};
-const OptionId SearchParams::kCpuctId{
+const OptionId BaseSearchParams::kCpuctId{
     "cpuct", "CPuct",
     "cpuct_init constant from \"UCT search\" algorithm. Higher values promote "
     "more exploration/wider search, lower values promote more "
     "confidence/deeper search."};
-const OptionId SearchParams::kCpuctAtRootId{
-    "cpuct-at-root", "CPuctAtRoot",
-    "cpuct_init constant from \"UCT search\" algorithm, for root node."};
-const OptionId SearchParams::kCpuctBaseId{
+const OptionId BaseSearchParams::kCpuctAtRootId{
+    {.long_flag = "cpuct-at-root",
+     .uci_option = "CPuctAtRoot",
+     .help_text =
+         "cpuct_init constant from \"UCT search\" algorithm, for root node.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kCpuctBaseId{
     "cpuct-base", "CPuctBase",
     "cpuct_base constant from \"UCT search\" algorithm. Lower value means "
     "higher growth of Cpuct as number of node visits grows."};
-const OptionId SearchParams::kCpuctBaseAtRootId{
-    "cpuct-base-at-root", "CPuctBaseAtRoot",
-    "cpuct_base constant from \"UCT search\" algorithm, for root node."};
-const OptionId SearchParams::kCpuctFactorId{
+const OptionId BaseSearchParams::kCpuctBaseAtRootId{
+    {.long_flag = "cpuct-base-at-root",
+     .uci_option = "CPuctBaseAtRoot",
+     .help_text =
+         "cpuct_base constant from \"UCT search\" algorithm, for root node.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kCpuctFactorId{
     "cpuct-factor", "CPuctFactor", "Multiplier for the cpuct growth formula."};
-const OptionId SearchParams::kCpuctFactorAtRootId{
-    "cpuct-factor-at-root", "CPuctFactorAtRoot",
-    "Multiplier for the cpuct growth formula at root."};
+const OptionId BaseSearchParams::kCpuctFactorAtRootId{
+    {.long_flag = "cpuct-factor-at-root",
+     .uci_option = "CPuctFactorAtRoot",
+     .help_text = "Multiplier for the cpuct growth formula at root.",
+     .visibility = OptionId::kProOnly}};
 // Remove this option after 0.25 has been made mandatory in training and the
 // training server stops sending it.
-const OptionId SearchParams::kRootHasOwnCpuctParamsId{
-    "root-has-own-cpuct-params", "RootHasOwnCpuctParams",
-    "If enabled, cpuct parameters for root node are taken from *AtRoot "
-    "parameters. Otherwise, they are the same as for the rest of nodes. "
-    "Temporary flag for transition to a new version."};
-const OptionId SearchParams::kTwoFoldDrawsId{
+const OptionId BaseSearchParams::kRootHasOwnCpuctParamsId{
+    {.long_flag = "root-has-own-cpuct-params",
+     .uci_option = "RootHasOwnCpuctParams",
+     .help_text =
+         "If enabled, cpuct parameters for root node are taken from *AtRoot "
+         "parameters. Otherwise, they are the same as for the rest of nodes. "
+         "Temporary flag for transition to a new version.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kTwoFoldDrawsId{
     "two-fold-draws", "TwoFoldDraws",
     "Evaluates twofold repetitions in the search tree as draws. Visits to "
-    "these positions are reverted when the first occurrence is played "
-    "and not in the search tree anymore."};
-const OptionId SearchParams::kTemperatureId{
-    "temperature", "Temperature",
-    "Tau value from softmax formula for the first move. If equal to 0, the "
-    "engine picks the best move to make. Larger values increase randomness "
-    "while making the move."};
-const OptionId SearchParams::kTempDecayMovesId{
-    "tempdecay-moves", "TempDecayMoves",
-    "Reduce temperature for every move after the first move, decreasing "
-    "linearly over this number of moves from initial temperature to 0. "
-    "A value of 0 disables tempdecay."};
-const OptionId SearchParams::kTempDecayDelayMovesId{
-    "tempdecay-delay-moves", "TempDecayDelayMoves",
-    "Delay the linear decrease of temperature by this number of moves, "
-    "decreasing linearly from initial temperature to 0. A value of 0 starts "
-    "tempdecay after the first move."};
-const OptionId SearchParams::kTemperatureCutoffMoveId{
-    "temp-cutoff-move", "TempCutoffMove",
-    "Move number, starting from which endgame temperature is used rather "
-    "than initial temperature. Setting it to 0 disables cutoff."};
-const OptionId SearchParams::kTemperatureEndgameId{
-    "temp-endgame", "TempEndgame",
-    "Temperature used during endgame (starting from cutoff move). Endgame "
-    "temperature doesn't decay."};
-const OptionId SearchParams::kTemperatureWinpctCutoffId{
-    "temp-value-cutoff", "TempValueCutoff",
-    "When move is selected using temperature, bad moves (with win "
-    "probability less than X than the best move) are not considered at all."};
-const OptionId SearchParams::kTemperatureVisitOffsetId{
-    "temp-visit-offset", "TempVisitOffset",
-    "Adjusts visits by this value when picking a move with a temperature. If a "
-    "negative offset reduces visits for a particular move below zero, that "
-    "move is not picked. If no moves can be picked, no temperature is used."};
-const OptionId SearchParams::kNoiseEpsilonId{
-    "noise-epsilon", "DirichletNoiseEpsilon",
-    "Amount of Dirichlet noise to combine with root priors. This allows the "
-    "engine to discover new ideas during training by exploring moves which are "
-    "known to be bad. Not normally used during play."};
-const OptionId SearchParams::kNoiseAlphaId{
-    "noise-alpha", "DirichletNoiseAlpha",
-    "Alpha of Dirichlet noise to control the sharpness of move probabilities. "
-    "Larger values result in flatter / more evenly distributed values."};
-const OptionId SearchParams::kVerboseStatsId{
+    "these positions are reverted when the first occurrence is played and not "
+    "in the search tree anymore."};
+const OptionId BaseSearchParams::kTemperatureId{
+    {.long_flag = "temperature",
+     .uci_option = "Temperature",
+     .help_text = "Tau value from softmax formula for the first move. If equal "
+                  "to 0, the engine picks the best move to make. Larger values "
+                  "increase randomness while making the move.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kTempDecayMovesId{
+    {.long_flag = "tempdecay-moves",
+     .uci_option = "TempDecayMoves",
+     .help_text = "Reduce temperature for every move after the first move, "
+                  "decreasing linearly over this number of moves from initial "
+                  "temperature to 0. A value of 0 disables tempdecay.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kTempDecayDelayMovesId{
+    {.long_flag = "tempdecay-delay-moves",
+     .uci_option = "TempDecayDelayMoves",
+     .help_text = "Delay the linear decrease of temperature by this number of "
+                  "moves, decreasing linearly from initial temperature to 0. A "
+                  "value of 0 starts tempdecay after the first move.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kTemperatureCutoffMoveId{
+    {.long_flag = "temp-cutoff-move",
+     .uci_option = "TempCutoffMove",
+     .help_text =
+         "Move number, starting from which endgame temperature is used rather "
+         "than initial temperature. Setting it to 0 disables cutoff.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kTemperatureEndgameId{
+    {.long_flag = "temp-endgame",
+     .uci_option = "TempEndgame",
+     .help_text = "Temperature used during endgame (starting from cutoff "
+                  "move). Endgame temperature doesn't decay.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kTemperatureWinpctCutoffId{
+    {.long_flag = "temp-value-cutoff",
+     .uci_option = "TempValueCutoff",
+     .help_text = "When move is selected using temperature, bad moves (with "
+                  "win probability less than X than the best move) are not "
+                  "considered at all.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kTemperatureVisitOffsetId{
+    {.long_flag = "temp-visit-offset",
+     .uci_option = "TempVisitOffset",
+     .help_text = "Adjusts visits by this value when picking a move with a "
+                  "temperature. If a negative offset reduces visits for a "
+                  "particular move below zero, that move is not picked. If no "
+                  "moves can be picked, no temperature is used.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kNoiseEpsilonId{
+    {.long_flag = "noise-epsilon",
+     .uci_option = "DirichletNoiseEpsilon",
+     .help_text =
+         "Amount of Dirichlet noise to combine with root priors. This allows "
+         "the engine to discover new ideas during training by exploring moves "
+         "which are known to be bad. Not normally used during play.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kNoiseAlphaId{
+    {.long_flag = "noise-alpha",
+     .uci_option = "DirichletNoiseAlpha",
+     .help_text = "Alpha of Dirichlet noise to control the sharpness of move "
+                  "probabilities. Larger values result in flatter / more "
+                  "evenly distributed values.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kVerboseStatsId{
     "verbose-move-stats", "VerboseMoveStats",
     "Display Q, V, N, U and P values of every move candidate after each move.",
     'v'};
-const OptionId SearchParams::kLogLiveStatsId{
-    "log-live-stats", "LogLiveStats",
-    "Do VerboseMoveStats on every info update."};
-const OptionId SearchParams::kFpuStrategyId{
+const OptionId BaseSearchParams::kLogLiveStatsId{
+    {.long_flag = "log-live-stats",
+     .uci_option = "LogLiveStats",
+     .help_text = "Do VerboseMoveStats on every info update.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kFpuStrategyId{
     "fpu-strategy", "FpuStrategy",
     "How is an eval of unvisited node determined. \"First Play Urgency\" "
     "changes search behavior to visit unvisited nodes earlier or later by "
     "using a placeholder eval before checking the network. The value specified "
     "with --fpu-value results in \"reduction\" subtracting that value from the "
     "parent eval while \"absolute\" directly uses that value."};
-const OptionId SearchParams::kFpuValueId{
+const OptionId BaseSearchParams::kFpuValueId{
     "fpu-value", "FpuValue",
     "\"First Play Urgency\" value used to adjust unvisited node eval based on "
     "--fpu-strategy."};
-const OptionId SearchParams::kFpuStrategyAtRootId{
-    "fpu-strategy-at-root", "FpuStrategyAtRoot",
-    "How is an eval of unvisited root children determined. Just like "
-    "--fpu-strategy except only at the root level and adjusts unvisited root "
-    "children eval with --fpu-value-at-root. In addition to matching the "
-    "strategies from --fpu-strategy, this can be \"same\" to disable the "
-    "special root behavior."};
-const OptionId SearchParams::kFpuValueAtRootId{
-    "fpu-value-at-root", "FpuValueAtRoot",
-    "\"First Play Urgency\" value used to adjust unvisited root children eval "
-    "based on --fpu-strategy-at-root. Has no effect if --fpu-strategy-at-root "
-    "is \"same\"."};
-const OptionId SearchParams::kCacheHistoryLengthId{
+const OptionId BaseSearchParams::kFpuStrategyAtRootId{
+    {.long_flag = "fpu-strategy-at-root",
+     .uci_option = "FpuStrategyAtRoot",
+     .help_text =
+         "How is an eval of unvisited root children determined. Just like "
+         "--fpu-strategy except only at the root level and adjusts unvisited "
+         "root children eval with --fpu-value-at-root. In addition to matching "
+         "the strategies from --fpu-strategy, this can be \"same\" to disable "
+         "the special root behavior.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kFpuValueAtRootId{
+    {.long_flag = "fpu-value-at-root",
+     .uci_option = "FpuValueAtRoot",
+     .help_text = "\"First Play Urgency\" value used to adjust unvisited root "
+                  "children eval based on --fpu-strategy-at-root. Has no "
+                  "effect if --fpu-strategy-at-root is \"same\".",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kCacheHistoryLengthId{
     "cache-history-length", "CacheHistoryLength",
     "Length of history, in half-moves, to include into the cache key. When "
     "this value is less than history that NN uses to eval a position, it's "
     "possble that the search will use eval of the same position with different "
     "history taken from cache."};
-const OptionId SearchParams::kPolicySoftmaxTempId{
-    "policy-softmax-temp", "PolicyTemperature",
-    "Policy softmax temperature. Higher values make priors of move candidates "
-    "closer to each other, widening the search."};
-const OptionId SearchParams::kMaxCollisionVisitsId{
+const OptionId BaseSearchParams::kMaxCollisionVisitsId{
     "max-collision-visits", "MaxCollisionVisits",
     "Total allowed node collision visits, per batch."};
-const OptionId SearchParams::kMaxCollisionEventsId{
+const OptionId BaseSearchParams::kMaxCollisionEventsId{
     "max-collision-events", "MaxCollisionEvents",
     "Allowed node collision events, per batch."};
-const OptionId SearchParams::kOutOfOrderEvalId{
+const OptionId BaseSearchParams::kOutOfOrderEvalId{
     "out-of-order-eval", "OutOfOrderEval",
     "During the gathering of a batch for NN to eval, if position happens to be "
     "in the cache or is terminal, evaluate it right away without sending the "
     "batch to the NN. When off, this may only happen with the very first node "
     "of a batch; when on, this can happen with any node."};
-const OptionId SearchParams::kMaxOutOfOrderEvalsFactorId{
+const OptionId BaseSearchParams::kMaxOutOfOrderEvalsFactorId{
     "max-out-of-order-evals-factor", "MaxOutOfOrderEvalsFactor",
     "Maximum number of out of order evals during gathering of a batch is "
     "calculated by multiplying the maximum batch size by this number."};
-const OptionId SearchParams::kStickyEndgamesId{
+const OptionId BaseSearchParams::kStickyEndgamesId{
     "sticky-endgames", "StickyEndgames",
     "When an end of game position is found during search, allow the eval of "
     "the previous move's position to stick to something more accurate. For "
     "example, if at least one move results in checkmate, then the position "
     "should stick as checkmated. Similarly, if all moves are drawn or "
     "checkmated, the position should stick as drawn or checkmate."};
-const OptionId SearchParams::kSyzygyFastPlayId{
+const OptionId BaseSearchParams::kSyzygyFastPlayId{
     "syzygy-fast-play", "SyzygyFastPlay",
     "With DTZ tablebase files, only allow the network pick from winning moves "
     "that have shortest DTZ to play faster (but not necessarily optimally)."};
-const OptionId SearchParams::kMultiPvId{
-    "multipv", "MultiPV",
-    "Number of game play lines (principal variations) to show in UCI info "
-    "output."};
-const OptionId SearchParams::kPerPvCountersId{
+const OptionId BaseSearchParams::kMultiPvId{
+    {.long_flag = "multipv",
+     .uci_option = "MultiPV",
+     .help_text = "Number of game play lines (principal variations) to show in "
+                  "UCI info output.",
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId BaseSearchParams::kPerPvCountersId{
     "per-pv-counters", "PerPVCounters",
     "Show node counts per principal variation instead of total nodes in UCI."};
-const OptionId SearchParams::kScoreTypeId{
+const OptionId BaseSearchParams::kScoreTypeId{
     "score-type", "ScoreType",
     "What to display as score. Either centipawns (the UCI default), win "
     "percentage or Q (the actual internal score) multiplied by 100."};
-const OptionId SearchParams::kHistoryFillId{
-    "history-fill", "HistoryFill",
-    "Neural network uses 7 previous board positions in addition to the current "
-    "one. During the first moves of the game such historical positions don't "
-    "exist, but they can be synthesized. This parameter defines when to "
-    "synthesize them (always, never, or only at non-standard fen position)."};
-const OptionId SearchParams::kMovesLeftMaxEffectId{
+const OptionId BaseSearchParams::kMovesLeftMaxEffectId{
     "moves-left-max-effect", "MovesLeftMaxEffect",
     "Maximum bonus to add to the score of a node based on how much "
     "shorter/longer it makes the game when winning/losing."};
-const OptionId SearchParams::kMovesLeftThresholdId{
+const OptionId BaseSearchParams::kMovesLeftThresholdId{
     "moves-left-threshold", "MovesLeftThreshold",
     "Absolute value of node Q needs to exceed this value before shorter wins "
     "or longer losses are considered."};
-const OptionId SearchParams::kMovesLeftSlopeId{
+const OptionId BaseSearchParams::kMovesLeftSlopeId{
     "moves-left-slope", "MovesLeftSlope",
     "Controls how the bonus for shorter wins or longer losses is adjusted "
     "based on how many moves the move is estimated to shorten/lengthen the "
     "game. The move difference is multiplied with the slope and capped at "
     "MovesLeftMaxEffect."};
-const OptionId SearchParams::kMovesLeftConstantFactorId{
+const OptionId BaseSearchParams::kMovesLeftConstantFactorId{
     "moves-left-constant-factor", "MovesLeftConstantFactor",
     "A simple multiplier to the moves left effect, can be set to 0 to only use "
     "an effect scaled by Q."};
-const OptionId SearchParams::kMovesLeftScaledFactorId{
+const OptionId BaseSearchParams::kMovesLeftScaledFactorId{
     "moves-left-scaled-factor", "MovesLeftScaledFactor",
     "A factor which is multiplied by the absolute Q of parent node and the "
     "base moves left effect."};
-const OptionId SearchParams::kMovesLeftQuadraticFactorId{
+const OptionId BaseSearchParams::kMovesLeftQuadraticFactorId{
     "moves-left-quadratic-factor", "MovesLeftQuadraticFactor",
     "A factor which is multiplied by the square of Q of parent node and the "
     "base moves left effect."};
-const OptionId SearchParams::kDisplayCacheUsageId{
-    "display-cache-usage", "DisplayCacheUsage",
-    "Display cache fullness through UCI info `hash` section."};
-const OptionId SearchParams::kMaxConcurrentSearchersId{
+const OptionId BaseSearchParams::kMaxConcurrentSearchersId{
     "max-concurrent-searchers", "MaxConcurrentSearchers",
     "If not 0, at most this many search workers can be gathering minibatches "
     "at once."};
-const OptionId SearchParams::kDrawScoreId{
+const OptionId BaseSearchParams::kDrawScoreId{
     "draw-score", "DrawScore",
     "Adjustment of the draw score from white's perspective. Value 0 gives "
     "standard scoring, value -1 gives Armageddon scoring."};
-const OptionId SearchParams::kContemptModeId{
+const OptionId BaseSearchParams::kContemptModeId{
     "contempt-mode", "ContemptMode",
     "Affects the way asymmetric WDL parameters are applied. Default is 'play' "
     "for matches, use 'white_side_analysis' and 'black_side_analysis' for "
     "analysis. Use 'disable' to deactivate contempt."};
-const OptionId SearchParams::kContemptId{
-    "contempt", "Contempt",
-    "The simulated Elo advantage for the WDL conversion. Comma separated "
-    "list in the form [name=]value, where the name is compared with the "
-    "`UCI_Opponent` value to find the appropriate contempt value. The default "
-    "value is taken from `UCI_RatingAdv` and will be overridden if either a "
-    "value without name is given, or if a name match is found."};
-const OptionId SearchParams::kContemptMaxValueId{
-    "contempt-max-value", "ContemptMaxValue",
-    "The maximum value of contempt used. Higher values will be capped."};
-const OptionId SearchParams::kWDLCalibrationEloId{
+const OptionId BaseSearchParams::kContemptId{
+    {.long_flag = "contempt",
+     .uci_option = "Contempt",
+     .help_text = "The simulated Elo advantage for the WDL conversion. Comma "
+                  "separated list in the form [name=]value, where the name is "
+                  "compared with the `UCI_Opponent` value to find the "
+                  "appropriate contempt value. The default value is taken from "
+                  "`UCI_RatingAdv` and will be overridden if either a value "
+                  "without name is given, or if a name match is found.",
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId BaseSearchParams::kContemptMaxValueId{
+    {.long_flag = "contempt-max-value",
+     .uci_option = "ContemptMaxValue",
+     .help_text =
+         "The maximum value of contempt used. Higher values will be capped.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kWDLCalibrationEloId{
     "wdl-calibration-elo", "WDLCalibrationElo",
-    "Elo of the active side, adjusted for time control relative to rapid."
-    "To retain raw WDL without sharpening/softening, use default value 0."};
-const OptionId SearchParams::kWDLContemptAttenuationId{
-    "wdl-contempt-attenuation", "WDLContemptAttenuation",
-    "Scales how Elo advantage is applied for contempt. Use 1.0 for realistic "
-    "analysis, and 0.5-0.6 for optimal match performance."};
-const OptionId SearchParams::kWDLMaxSId{
-    "wdl-max-s", "WDLMaxS",
-    "Limits the WDL derived sharpness s to a reasonable value to avoid "
-    "erratic behavior at high contempt values. Default recommended for "
-    "regular chess, increase value for more volatile positions like DFRC "
-    "or piece odds."};
-const OptionId SearchParams::kWDLEvalObjectivityId{
+    "Elo of the active side, adjusted for time control relative to rapid. To "
+    "retain raw WDL without sharpening/softening, use default value 0."};
+const OptionId BaseSearchParams::kWDLContemptAttenuationId{
+    {.long_flag = "wdl-contempt-attenuation",
+     .uci_option = "WDLContemptAttenuation",
+     .help_text =
+         "Scales how Elo advantage is applied for contempt. Use 1.0 for "
+         "realistic analysis, and 0.5-0.6 for optimal match performance.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kWDLMaxSId{
+    {.long_flag = "wdl-max-s",
+     .uci_option = "WDLMaxS",
+     .help_text = "Limits the WDL derived sharpness s to a reasonable value to "
+                  "avoid erratic behavior at high contempt values. Default "
+                  "recommended for regular chess, increase value for more "
+                  "volatile positions like DFRC or piece odds.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kWDLEvalObjectivityId{
     "wdl-eval-objectivity", "WDLEvalObjectivity",
-    "When calculating the centipawn eval output, decides how objective/"
-    "contempt influenced the reported eval should be. Value 0.0 reports the "
-    "internally used WDL values, 1.0 attempts an objective eval."};
-const OptionId SearchParams::kWDLDrawRateTargetId{
-    "wdl-draw-rate-target", "WDLDrawRateTarget",
-    "To define the accuracy of play, the target draw rate in equal "
-    "positions is used as a proxy. Ignored if WDLCalibrationElo is set. "
-    "To retain raw WDL without sharpening/softening, use default value 0."};
-const OptionId SearchParams::kWDLDrawRateReferenceId{
+    "When calculating the centipawn eval output, decides how "
+    "objective/contempt influenced the reported eval should be. Value 0.0 "
+    "reports the internally used WDL values, 1.0 attempts an objective eval."};
+const OptionId BaseSearchParams::kWDLDrawRateTargetId{
+    {.long_flag = "wdl-draw-rate-target",
+     .uci_option = "WDLDrawRateTarget",
+     .help_text =
+         "To define the accuracy of play, the target draw rate in equal "
+         "positions is used as a proxy. Ignored if WDLCalibrationElo is set. "
+         "To retain raw WDL without sharpening/softening, use default value 0.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kWDLDrawRateReferenceId{
     "wdl-draw-rate-reference", "WDLDrawRateReference",
-    "Set this to the draw rate predicted by the used neural network at "
-    "default settings. The accuracy rescaling is done relative to the "
-    "reference draw rate."};
-const OptionId SearchParams::kWDLBookExitBiasId{
-    "wdl-book-exit-bias", "WDLBookExitBias",
-    "The book exit bias used when measuring engine Elo. Value of startpos is "
-    "around 0.2, value of 50% white win is 1. Only relevant if target draw "
-    "rate is above 80%; ignored if WDLCalibrationElo is set."};
-const OptionId SearchParams::kNpsLimitId{
+    "Set this to the draw rate predicted by the used neural network at default "
+    "settings. The accuracy rescaling is done relative to the reference draw "
+    "rate."};
+const OptionId BaseSearchParams::kWDLBookExitBiasId{
+    {.long_flag = "wdl-book-exit-bias",
+     .uci_option = "WDLBookExitBias",
+     .help_text =
+         "The book exit bias used when measuring engine Elo. Value of startpos "
+         "is around 0.2, value of 50% white win is 1. Only relevant if target "
+         "draw rate is above 80%; ignored if WDLCalibrationElo is set.",
+     .visibility = OptionId::kProOnly}};
+const OptionId BaseSearchParams::kNpsLimitId{
     "nps-limit", "NodesPerSecondLimit",
     "An option to specify an upper limit to the nodes per second searched. The "
     "accuracy depends on the minibatch size used, increasing for lower sizes, "
     "and on the length of the search. Zero to disable."};
-const OptionId SearchParams::kSolidTreeThresholdId{
-    "solid-tree-threshold", "SolidTreeThreshold",
-    "Only nodes with at least this number of visits will be considered for "
-    "solidification for improved cache locality."};
-const OptionId SearchParams::kTaskWorkersPerSearchWorkerId{
+const OptionId BaseSearchParams::kTaskWorkersPerSearchWorkerId{
     "task-workers", "TaskWorkers",
     "The number of task workers to use to help the search worker. Setting to "
     "-1 will use a heuristic value."};
-const OptionId SearchParams::kMinimumWorkSizeForProcessingId{
+const OptionId BaseSearchParams::kMinimumWorkSizeForProcessingId{
     "minimum-processing-work", "MinimumProcessingWork",
     "This many visits need to be gathered before tasks will be used to "
     "accelerate processing."};
-const OptionId SearchParams::kMinimumWorkSizeForPickingId{
+const OptionId BaseSearchParams::kMinimumWorkSizeForPickingId{
     "minimum-picking-work", "MinimumPickingWork",
     "Search branches with more than this many collisions/visits may be split "
     "off to task workers."};
-const OptionId SearchParams::kMinimumRemainingWorkSizeForPickingId{
+const OptionId BaseSearchParams::kMinimumRemainingWorkSizeForPickingId{
     "minimum-remaining-picking-work", "MinimumRemainingPickingWork",
     "Search branches won't be split off to task workers unless there is at "
     "least this much work left to do afterwards."};
-const OptionId SearchParams::kMinimumWorkPerTaskForProcessingId{
+const OptionId BaseSearchParams::kMinimumWorkPerTaskForProcessingId{
     "minimum-per-task-processing", "MinimumPerTaskProcessing",
     "Processing work won't be split into chunks smaller than this (unless its "
     "more than half of MinimumProcessingWork)."};
-const OptionId SearchParams::kIdlingMinimumWorkId{
+const OptionId BaseSearchParams::kIdlingMinimumWorkId{
     "idling-minimum-work", "IdlingMinimumWork",
     "Only early exit gathering due to 'idle' backend if more than this many "
     "nodes will be sent to the backend."};
-const OptionId SearchParams::kThreadIdlingThresholdId{
+const OptionId BaseSearchParams::kThreadIdlingThresholdId{
     "thread-idling-threshold", "ThreadIdlingThreshold",
     "If there are more than this number of search threads that are not "
     "actively in the process of either sending data to the backend or waiting "
     "for data from the backend, assume that the backend is idle."};
-const OptionId SearchParams::kMaxCollisionVisitsScalingStartId{
+const OptionId BaseSearchParams::kMaxCollisionVisitsScalingStartId{
     "max-collision-visits-scaling-start", "MaxCollisionVisitsScalingStart",
     "Tree size where max collision visits starts scaling up from 1."};
-const OptionId SearchParams::kMaxCollisionVisitsScalingEndId{
+const OptionId BaseSearchParams::kMaxCollisionVisitsScalingEndId{
     "max-collision-visits-scaling-end", "MaxCollisionVisitsScalingEnd",
     "Tree size where max collision visits reaches max. Set to 0 to disable "
     "scaling entirely."};
-const OptionId SearchParams::kMaxCollisionVisitsScalingPowerId{
+const OptionId BaseSearchParams::kMaxCollisionVisitsScalingPowerId{
     "max-collision-visits-scaling-power", "MaxCollisionVisitsScalingPower",
     "Power to apply to the interpolation between 1 and max to make it curved."};
-const OptionId SearchParams::kUCIOpponentId{
+const OptionId BaseSearchParams::kUCIOpponentId{
     "", "UCI_Opponent",
     "UCI option used by the GUI to pass the name and other information about "
     "the current opponent."};
-const OptionId SearchParams::kUCIRatingAdvId{
+const OptionId BaseSearchParams::kUCIRatingAdvId{
     "", "UCI_RatingAdv",
     "UCI extension used by some GUIs to pass the estimated Elo advantage over "
     "the current opponent, used as the default contempt value."};
-const OptionId SearchParams::kSearchSpinBackoffId{
+const OptionId BaseSearchParams::kSearchSpinBackoffId{
     "search-spin-backoff", "SearchSpinBackoff",
     "Enable backoff for the spin lock that acquires available searcher."};
+const OptionId BaseSearchParams::kGarbageCollectionDelayId{
+    "garbage-collection-delay", "GarbageCollectionDelay",
+    "The percentage of expected move time until garbage collection starts. "
+    "Delay lets search find transpositions to freed search tree branches."};
 
-void SearchParams::Populate(OptionsParser* options) {
+const OptionId SearchParams::kMaxPrefetchBatchId{
+    "max-prefetch", "MaxPrefetch",
+    "When the engine cannot gather a large enough batch for immediate use, try "
+    "to prefetch up to X positions which are likely to be useful soon, and put "
+    "them into cache."};
+const OptionId SearchParams::kSolidTreeThresholdId{
+    "solid-tree-threshold", "SolidTreeThreshold",
+    "Only nodes with at least this number of visits will be considered for "
+    "solidification for improved cache locality."};
+
+void BaseSearchParams::Populate(OptionsParser* options) {
   // Here the uci optimized defaults" are set.
   // Many of them are overridden with training specific values in tournament.cc.
   options->Add<IntOption>(kMiniBatchSizeId, 0, 1024) = 0;
-  options->Add<IntOption>(kMaxPrefetchBatchId, 0, 1024) = DEFAULT_MAX_PREFETCH;
   options->Add<FloatOption>(kCpuctId, 0.0f, 100.0f) = 1.745f;
   options->Add<FloatOption>(kCpuctAtRootId, 0.0f, 100.0f) = 1.745f;
   options->Add<FloatOption>(kCpuctBaseId, 1.0f, 1000000000.0f) = 38739.0f;
@@ -521,7 +571,6 @@ void SearchParams::Populate(OptionsParser* options) {
   options->Add<ChoiceOption>(kFpuStrategyAtRootId, fpu_strategy) = "same";
   options->Add<FloatOption>(kFpuValueAtRootId, -100.0f, 100.0f) = 1.0f;
   options->Add<IntOption>(kCacheHistoryLengthId, 0, 7) = 0;
-  options->Add<FloatOption>(kPolicySoftmaxTempId, 0.1f, 10.0f) = 1.359f;
   options->Add<IntOption>(kMaxCollisionEventsId, 1, 65536) = 917;
   options->Add<IntOption>(kMaxCollisionVisitsId, 1, 100000000) = 80000;
   options->Add<IntOption>(kMaxCollisionVisitsScalingStartId, 1, 100000) = 28;
@@ -545,7 +594,6 @@ void SearchParams::Populate(OptionsParser* options) {
                                          "WDL_mu"};
   options->Add<ChoiceOption>(kScoreTypeId, score_type) = "WDL_mu";
   std::vector<std::string> history_fill_opt{"no", "fen_only", "always"};
-  options->Add<ChoiceOption>(kHistoryFillId, history_fill_opt) = "fen_only";
   options->Add<FloatOption>(kMovesLeftMaxEffectId, 0.0f, 1.0f) = 0.0345f;
   options->Add<FloatOption>(kMovesLeftThresholdId, 0.0f, 1.0f) = 0.8f;
   options->Add<FloatOption>(kMovesLeftSlopeId, 0.0f, 1.0f) = 0.0027f;
@@ -553,7 +601,6 @@ void SearchParams::Populate(OptionsParser* options) {
   options->Add<FloatOption>(kMovesLeftScaledFactorId, -2.0f, 2.0f) = 1.6521f;
   options->Add<FloatOption>(kMovesLeftQuadraticFactorId, -1.0f, 1.0f) =
       -0.6521f;
-  options->Add<BoolOption>(kDisplayCacheUsageId) = false;
   options->Add<IntOption>(kMaxConcurrentSearchersId, 0, 128) = 1;
   options->Add<FloatOption>(kDrawScoreId, -1.0f, 1.0f) = 0.0f;
   std::vector<std::string> mode = {"play", "white_side_analysis",
@@ -572,7 +619,6 @@ void SearchParams::Populate(OptionsParser* options) {
   options->Add<FloatOption>(kWDLDrawRateReferenceId, 0.001f, 0.999f) = 0.5f;
   options->Add<FloatOption>(kWDLBookExitBiasId, -2.0f, 2.0f) = 0.65f;
   options->Add<FloatOption>(kNpsLimitId, 0.0f, 1e6f) = 0.0f;
-  options->Add<IntOption>(kSolidTreeThresholdId, 1, 2000000000) = 100;
   options->Add<IntOption>(kTaskWorkersPerSearchWorkerId, -1, 128) = -1;
   options->Add<IntOption>(kMinimumWorkSizeForProcessingId, 2, 100000) = 20;
   options->Add<IntOption>(kMinimumWorkSizeForPickingId, 1, 100000) = 1;
@@ -584,32 +630,16 @@ void SearchParams::Populate(OptionsParser* options) {
   options->Add<StringOption>(kUCIOpponentId);
   options->Add<FloatOption>(kUCIRatingAdvId, -10000.0f, 10000.0f) = 0.0f;
   options->Add<BoolOption>(kSearchSpinBackoffId) = false;
+  options->Add<FloatOption>(kGarbageCollectionDelayId, 0.0f, 100.0f) = 10.0f;
+}
 
-  options->HideOption(kNoiseEpsilonId);
-  options->HideOption(kNoiseAlphaId);
-  options->HideOption(kLogLiveStatsId);
-  options->HideOption(kDisplayCacheUsageId);
-  options->HideOption(kRootHasOwnCpuctParamsId);
-  options->HideOption(kCpuctAtRootId);
-  options->HideOption(kCpuctBaseAtRootId);
-  options->HideOption(kCpuctFactorAtRootId);
-  options->HideOption(kFpuStrategyAtRootId);
-  options->HideOption(kFpuValueAtRootId);
-  options->HideOption(kTemperatureId);
-  options->HideOption(kTempDecayMovesId);
-  options->HideOption(kTempDecayDelayMovesId);
-  options->HideOption(kTemperatureCutoffMoveId);
-  options->HideOption(kTemperatureEndgameId);
-  options->HideOption(kTemperatureWinpctCutoffId);
-  options->HideOption(kTemperatureVisitOffsetId);
-  options->HideOption(kContemptMaxValueId);
-  options->HideOption(kWDLContemptAttenuationId);
-  options->HideOption(kWDLMaxSId);
-  options->HideOption(kWDLDrawRateTargetId);
-  options->HideOption(kWDLBookExitBiasId);
+void SearchParams::Populate(OptionsParser* options) {
+  BaseSearchParams::Populate(options);
+  options->Add<IntOption>(kMaxPrefetchBatchId, 0, 1024) = DEFAULT_MAX_PREFETCH;
+  options->Add<IntOption>(kSolidTreeThresholdId, 1, 2000000000) = 100;
 }
 
-SearchParams::SearchParams(const OptionsDict& options)
+BaseSearchParams::BaseSearchParams(const OptionsDict& options)
     : options_(options),
       kCpuct(options.Get<float>(kCpuctId)),
       kCpuctAtRoot(options.Get<float>(
@@ -636,13 +666,15 @@ SearchParams::SearchParams(const OptionsDict& options)
                           ? kFpuValue
                           : options.Get<float>(kFpuValueAtRootId)),
       kCacheHistoryLength(options.Get<int>(kCacheHistoryLengthId)),
-      kPolicySoftmaxTemp(options.Get<float>(kPolicySoftmaxTempId)),
+      kPolicySoftmaxTemp(
+          options.Get<float>(SharedBackendParams::kPolicySoftmaxTemp)),
       kMaxCollisionEvents(options.Get<int>(kMaxCollisionEventsId)),
       kMaxCollisionVisits(options.Get<int>(kMaxCollisionVisitsId)),
       kOutOfOrderEval(options.Get<bool>(kOutOfOrderEvalId)),
       kStickyEndgames(options.Get<bool>(kStickyEndgamesId)),
       kSyzygyFastPlay(options.Get<bool>(kSyzygyFastPlayId)),
-      kHistoryFill(EncodeHistoryFill(options.Get<std::string>(kHistoryFillId))),
+      kHistoryFill(EncodeHistoryFill(
+          options.Get<std::string>(SharedBackendParams::kHistoryFill))),
       kMiniBatchSize(options.Get<int>(kMiniBatchSizeId)),
       kMovesLeftMaxEffect(options.Get<float>(kMovesLeftMaxEffectId)),
       kMovesLeftThreshold(options.Get<float>(kMovesLeftThresholdId)),
@@ -651,7 +683,6 @@ SearchParams::SearchParams(const OptionsDict& options)
       kMovesLeftScaledFactor(options.Get<float>(kMovesLeftScaledFactorId)),
       kMovesLeftQuadraticFactor(
           options.Get<float>(kMovesLeftQuadraticFactorId)),
-      kDisplayCacheUsage(options.Get<bool>(kDisplayCacheUsageId)),
       kMaxConcurrentSearchers(options.Get<int>(kMaxConcurrentSearchersId)),
       kDrawScore(options.Get<float>(kDrawScoreId)),
       kContempt(GetContempt(options.Get<std::string>(kUCIOpponentId),
@@ -675,7 +706,6 @@ SearchParams::SearchParams(const OptionsDict& options)
       kMaxOutOfOrderEvalsFactor(
           options.Get<float>(kMaxOutOfOrderEvalsFactorId)),
       kNpsLimit(options.Get<float>(kNpsLimitId)),
-      kSolidTreeThreshold(options.Get<int>(kSolidTreeThresholdId)),
       kTaskWorkersPerSearchWorker(
           options.Get<int>(kTaskWorkersPerSearchWorkerId)),
       kMinimumWorkSizeForProcessing(
@@ -694,6 +724,11 @@ SearchParams::SearchParams(const OptionsDict& options)
           options.Get<int>(kMaxCollisionVisitsScalingEndId)),
       kMaxCollisionVisitsScalingPower(
           options.Get<float>(kMaxCollisionVisitsScalingPowerId)),
-      kSearchSpinBackoff(options_.Get<bool>(kSearchSpinBackoffId)) {}
+      kSearchSpinBackoff(options_.Get<bool>(kSearchSpinBackoffId)),
+      kGarbageCollectionDelay(options_.Get<float>(kGarbageCollectionDelayId)) {}
 
+SearchParams::SearchParams(const OptionsDict& options)
+    : BaseSearchParams(options),
+      kSolidTreeThreshold(options.Get<int>(kSolidTreeThresholdId)) {}
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/params.h b/src/search/classic/params.h
similarity index 94%
rename from src/mcts/params.h
rename to src/search/classic/params.h
index df02f124fb..d84dbad5d8 100644
--- a/src/mcts/params.h
+++ b/src/search/classic/params.h
@@ -1,6 +1,6 @@
 /*
   This file is part of Leela Chess Zero.
-  Copyright (C) 2018-2019 The LCZero Authors
+  Copyright (C) 2018-2025 The LCZero Authors
 
   Leela Chess is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -32,13 +32,14 @@
 #include "utils/optionsparser.h"
 
 namespace lczero {
+namespace classic {
 
 enum class ContemptMode { PLAY, WHITE, BLACK, NONE };
 
-class SearchParams {
+class BaseSearchParams {
  public:
-  SearchParams(const OptionsDict& options);
-  SearchParams(const SearchParams&) = delete;
+  BaseSearchParams(const OptionsDict& options);
+  BaseSearchParams(const BaseSearchParams&) = delete;
 
   // Use struct for WDLRescaleParams calculation to make them const.
   struct WDLRescaleParams {
@@ -55,9 +56,6 @@ class SearchParams {
 
   // Parameter getters.
   int GetMiniBatchSize() const { return kMiniBatchSize; }
-  int GetMaxPrefetchBatch() const {
-    return options_.Get<int>(kMaxPrefetchBatchId);
-  }
   float GetCpuct(bool at_root) const { return at_root ? kCpuctAtRoot : kCpuct; }
   float GetCpuctBase(bool at_root) const {
     return at_root ? kCpuctBaseAtRoot : kCpuctBase;
@@ -114,7 +112,6 @@ class SearchParams {
   float GetMovesLeftQuadraticFactor() const {
     return kMovesLeftQuadraticFactor;
   }
-  bool GetDisplayCacheUsage() const { return kDisplayCacheUsage; }
   int GetMaxConcurrentSearchers() const { return kMaxConcurrentSearchers; }
   float GetDrawScore() const { return kDrawScore; }
   ContemptMode GetContemptMode() const {
@@ -133,7 +130,6 @@ class SearchParams {
     return kMaxOutOfOrderEvalsFactor;
   }
   float GetNpsLimit() const { return kNpsLimit; }
-  int GetSolidTreeThreshold() const { return kSolidTreeThreshold; }
 
   int GetTaskWorkersPerSearchWorker() const {
     return kTaskWorkersPerSearchWorker;
@@ -163,9 +159,12 @@ class SearchParams {
   }
   bool GetSearchSpinBackoff() const { return kSearchSpinBackoff; }
 
+  float GetGarbageCollectionDelay() const {
+    return kGarbageCollectionDelay;
+  }
+
   // Search parameter IDs.
   static const OptionId kMiniBatchSizeId;
-  static const OptionId kMaxPrefetchBatchId;
   static const OptionId kCpuctId;
   static const OptionId kCpuctAtRootId;
   static const OptionId kCpuctBaseId;
@@ -190,7 +189,6 @@ class SearchParams {
   static const OptionId kFpuStrategyAtRootId;
   static const OptionId kFpuValueAtRootId;
   static const OptionId kCacheHistoryLengthId;
-  static const OptionId kPolicySoftmaxTempId;
   static const OptionId kMaxCollisionEventsId;
   static const OptionId kMaxCollisionVisitsId;
   static const OptionId kOutOfOrderEvalId;
@@ -199,14 +197,12 @@ class SearchParams {
   static const OptionId kMultiPvId;
   static const OptionId kPerPvCountersId;
   static const OptionId kScoreTypeId;
-  static const OptionId kHistoryFillId;
   static const OptionId kMovesLeftMaxEffectId;
   static const OptionId kMovesLeftThresholdId;
   static const OptionId kMovesLeftConstantFactorId;
   static const OptionId kMovesLeftScaledFactorId;
   static const OptionId kMovesLeftQuadraticFactorId;
   static const OptionId kMovesLeftSlopeId;
-  static const OptionId kDisplayCacheUsageId;
   static const OptionId kMaxConcurrentSearchersId;
   static const OptionId kDrawScoreId;
   static const OptionId kContemptModeId;
@@ -221,7 +217,6 @@ class SearchParams {
   static const OptionId kWDLBookExitBiasId;
   static const OptionId kMaxOutOfOrderEvalsFactorId;
   static const OptionId kNpsLimitId;
-  static const OptionId kSolidTreeThresholdId;
   static const OptionId kTaskWorkersPerSearchWorkerId;
   static const OptionId kMinimumWorkSizeForProcessingId;
   static const OptionId kMinimumWorkSizeForPickingId;
@@ -235,9 +230,12 @@ class SearchParams {
   static const OptionId kUCIOpponentId;
   static const OptionId kUCIRatingAdvId;
   static const OptionId kSearchSpinBackoffId;
+  static const OptionId kGarbageCollectionDelayId;
 
- private:
+ protected:
   const OptionsDict& options_;
+
+ private:
   // Cached parameter values. Values have to be cached if either:
   // 1. Parameter is accessed often and has to be cached for performance
   // reasons.
@@ -272,7 +270,6 @@ class SearchParams {
   const float kMovesLeftConstantFactor;
   const float kMovesLeftScaledFactor;
   const float kMovesLeftQuadraticFactor;
-  const bool kDisplayCacheUsage;
   const int kMaxConcurrentSearchers;
   const float kDrawScore;
   const float kContempt;
@@ -281,7 +278,6 @@ class SearchParams {
   const float kWDLEvalObjectivity;
   const float kMaxOutOfOrderEvalsFactor;
   const float kNpsLimit;
-  const int kSolidTreeThreshold;
   const int kTaskWorkersPerSearchWorker;
   const int kMinimumWorkSizeForProcessing;
   const int kMinimumWorkSizeForPicking;
@@ -293,6 +289,29 @@ class SearchParams {
   const int kMaxCollisionVisitsScalingEnd;
   const float kMaxCollisionVisitsScalingPower;
   const bool kSearchSpinBackoff;
+  const float kGarbageCollectionDelay;
 };
 
+class SearchParams : public BaseSearchParams {
+ public:
+  SearchParams(const OptionsDict& options);
+  SearchParams(const SearchParams&) = delete;
+
+  // Populates UciOptions with search parameters.
+  static void Populate(OptionsParser* options);
+
+  // Parameter getters.
+  int GetMaxPrefetchBatch() const {
+    return options_.Get<int>(kMaxPrefetchBatchId);
+  }
+  int GetSolidTreeThreshold() const { return kSolidTreeThreshold; }
+
+  // Search parameter IDs.
+  static const OptionId kMaxPrefetchBatchId;
+  static const OptionId kSolidTreeThresholdId;
+
+ private:
+  const int kSolidTreeThreshold;
+};
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/search.cc b/src/search/classic/search.cc
similarity index 91%
rename from src/mcts/search.cc
rename to src/search/classic/search.cc
index b3326a7661..7a4d1c7deb 100644
--- a/src/mcts/search.cc
+++ b/src/search/classic/search.cc
@@ -25,11 +25,10 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/search.h"
+#include "search/classic/search.h"
 
 #include <algorithm>
 #include <array>
-#include <chrono>
 #include <cmath>
 #include <iomanip>
 #include <iostream>
@@ -37,14 +36,15 @@
 #include <sstream>
 #include <thread>
 
-#include "mcts/node.h"
-#include "neural/cache.h"
 #include "neural/encoder.h"
+#include "search/classic/node.h"
 #include "utils/fastmath.h"
 #include "utils/random.h"
 #include "utils/spinhelper.h"
+#include "utils/trace.h"
 
 namespace lczero {
+namespace classic {
 
 namespace {
 // Maximum delay between outputting "uci info" when nothing interesting happens.
@@ -149,20 +149,20 @@ class MEvaluator {
 
 }  // namespace
 
-Search::Search(const NodeTree& tree, Network* network,
+Search::Search(const NodeTree& tree, Backend* backend,
                std::unique_ptr<UciResponder> uci_responder,
                const MoveList& searchmoves,
                std::chrono::steady_clock::time_point start_time,
                std::unique_ptr<SearchStopper> stopper, bool infinite,
-               bool ponder, const OptionsDict& options, NNCache* cache,
+               bool ponder, const OptionsDict& options,
                SyzygyTablebase* syzygy_tb)
     : ok_to_respond_bestmove_(!infinite && !ponder),
       stopper_(std::move(stopper)),
       root_node_(tree.GetCurrentHead()),
-      cache_(cache),
       syzygy_tb_(syzygy_tb),
       played_history_(tree.GetPositionHistory()),
-      network_(network),
+      backend_(backend),
+      backend_attributes_(backend->GetAttributes()),
       params_(options),
       searchmoves_(searchmoves),
       start_time_(start_time),
@@ -263,7 +263,6 @@ void Search::SendUciInfo() REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
   const auto edges = GetBestChildrenNoTemperature(root_node_, max_pv, 0);
   const auto score_type = params_.GetScoreType();
   const auto per_pv_counters = params_.GetPerPvCounters();
-  const auto display_cache_usage = params_.GetDisplayCacheUsage();
   const auto draw_score = GetDrawScore(false);
 
   std::vector<ThinkingInfo> uci_infos;
@@ -276,10 +275,6 @@ void Search::SendUciInfo() REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
   if (!per_pv_counters) {
     common_info.nodes = total_playouts_ + initial_visits_;
   }
-  if (display_cache_usage) {
-    common_info.hashfull =
-        cache_->GetSize() * 1000LL / std::max(cache_->GetCapacity(), 1);
-  }
   if (nps_start_time_) {
     const auto time_since_first_batch_ms =
         std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -287,6 +282,7 @@ void Search::SendUciInfo() REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
             .count();
     if (time_since_first_batch_ms > 0) {
       common_info.nps = total_playouts_ * 1000 / time_since_first_batch_ms;
+      common_info.eps = network_evaluations_ * 1000 / time_since_first_batch_ms;
     }
   }
   common_info.tb_hits = tb_hits_.load(std::memory_order_acquire);
@@ -340,7 +336,7 @@ void Search::SendUciInfo() REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
       const float centipawn_fallback_threshold = 0.996f;
       float centipawn_score = 45 * tan(1.56728071628 * wl);
       uci_info.score =
-          network_->GetCapabilities().has_wdl() && mu_uci != 0.0f &&
+          backend_attributes_.has_wdl && mu_uci != 0.0f &&
                   std::abs(wl) + d < centipawn_fallback_threshold &&
                   (std::abs(mu_uci) < 1.0f ||
                    std::abs(centipawn_score) < std::abs(100 * mu_uci))
@@ -360,7 +356,7 @@ void Search::SendUciInfo() REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
       wdl_d = 0;
     }
     uci_info.wdl = ThinkingInfo::WDL{wdl_w, wdl_d, wdl_l};
-    if (network_->GetCapabilities().has_mlh()) {
+    if (backend_attributes_.has_mlh) {
       uci_info.moves_left = static_cast<int>(
           (1.0f + edge.GetM(1.0f + root_node_->GetM())) / 2.0f);
     }
@@ -431,7 +427,7 @@ float Search::GetDrawScore(bool is_odd_depth) const {
 }
 
 namespace {
-inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
                     float draw_score) {
   const auto value = params.GetFpuValue(is_root_node);
   return params.GetFpuAbsolute(is_root_node)
@@ -441,7 +437,7 @@ inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
 }
 
 // Faster version for if visited_policy is readily available already.
-inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
                     float draw_score, float visited_pol) {
   const auto value = params.GetFpuValue(is_root_node);
   return params.GetFpuAbsolute(is_root_node)
@@ -458,7 +454,10 @@ inline float ComputeCpuct(const SearchParams& params, uint32_t N,
 }
 }  // namespace
 
-std::vector<std::string> Search::GetVerboseStats(Node* node) const {
+// Ignore the last tuple element when sorting in GetVerboseStats
+static bool operator<(const EdgeAndNode&, const EdgeAndNode&) { return false; }
+
+std::vector<std::string> Search::GetVerboseStats(const Node* node) const {
   assert(node == root_node_ || node->GetParent() == root_node_);
   const bool is_root = (node == root_node_);
   const bool is_odd_depth = !is_root;
@@ -468,16 +467,14 @@ std::vector<std::string> Search::GetVerboseStats(Node* node) const {
   const float cpuct = ComputeCpuct(params_, node->GetN(), is_root);
   const float U_coeff =
       cpuct * std::sqrt(std::max(node->GetChildrenVisits(), 1u));
-  std::vector<EdgeAndNode> edges;
-  for (const auto& edge : node->Edges()) edges.push_back(edge);
-
-  std::sort(edges.begin(), edges.end(),
-            [&fpu, &U_coeff, &draw_score](EdgeAndNode a, EdgeAndNode b) {
-              return std::forward_as_tuple(
-                         a.GetN(), a.GetQ(fpu, draw_score) + a.GetU(U_coeff)) <
-                     std::forward_as_tuple(
-                         b.GetN(), b.GetQ(fpu, draw_score) + b.GetU(U_coeff));
-            });
+  std::vector<std::tuple<uint32_t, float, EdgeAndNode>> edges;
+  edges.reserve(node->GetNumEdges());
+  for (const auto& edge : node->Edges()) {
+    edges.emplace_back(edge.GetN(),
+                       edge.GetQ(fpu, draw_score) + edge.GetU(U_coeff),
+                       edge);
+  }
+  std::sort(edges.begin(), edges.end());
 
   auto print = [](auto* oss, auto pre, auto v, auto post, auto w, int p = 0) {
     *oss << pre << std::setw(w) << std::setprecision(p) << v << post;
@@ -520,8 +517,10 @@ std::vector<std::string> Search::GetVerboseStats(Node* node) const {
     std::optional<float> v;
     if (n && n->IsTerminal()) {
       v = n->GetQ(sign * draw_score);
-    } else {
-      NNCacheLock nneval = GetCachedNNEval(n);
+    } else if (n) {
+      auto history = GetPositionHistoryAtNode(n);
+      std::optional<EvalResult> nneval = backend_->GetCachedEvaluation(
+          EvalPosition{history.GetPositions(), {}});
       if (nneval) v = -nneval->q;
     }
     if (v) {
@@ -545,18 +544,18 @@ std::vector<std::string> Search::GetVerboseStats(Node* node) const {
   };
 
   std::vector<std::string> infos;
-  const auto m_evaluator = network_->GetCapabilities().has_mlh()
-                               ? MEvaluator(params_, node)
-                               : MEvaluator();
-  for (const auto& edge : edges) {
+  const auto m_evaluator =
+      backend_attributes_.has_mlh ? MEvaluator(params_, node) : MEvaluator();
+  for (const auto& edge_tuple : edges) {
+    const auto& edge = std::get<2>(edge_tuple);
     float Q = edge.GetQ(fpu, draw_score);
     float M = m_evaluator.GetMUtility(edge, Q);
     std::ostringstream oss;
     oss << std::left;
     // TODO: should this be displaying transformed index?
-    print_head(&oss, edge.GetMove(is_black_to_move).as_string(),
-               edge.GetMove().as_nn_index(0), edge.GetN(), edge.GetNInFlight(),
-               edge.GetP());
+    print_head(&oss, edge.GetMove(is_black_to_move).ToString(true),
+               MoveToNNIndex(edge.GetMove(), 0), edge.GetN(),
+               edge.GetNInFlight(), edge.GetP());
     print_stats(&oss, edge.node());
     print(&oss, "(U: ", edge.GetU(U_coeff), ") ", 6, 5);
     print(&oss, "(S: ", Q + edge.GetU(U_coeff) + M, ") ", 8, 5);
@@ -595,7 +594,7 @@ void Search::SendMovesStats() const REQUIRES(counters_mutex_) {
       continue;
     }
     if (edge.HasNode()) {
-      LOGFILE << "--- Opponent moves after: " << final_bestmove_.as_string();
+      LOGFILE << "--- Opponent moves after: " << final_bestmove_.ToString(true);
       for (const auto& line : GetVerboseStats(edge.node())) {
         LOGFILE << line;
       }
@@ -603,20 +602,16 @@ void Search::SendMovesStats() const REQUIRES(counters_mutex_) {
   }
 }
 
-NNCacheLock Search::GetCachedNNEval(const Node* node) const {
-  if (!node) return {};
-
-  std::vector<Move> moves;
-  for (; node != root_node_; node = node->GetParent()) {
-    moves.push_back(node->GetOwnEdge()->GetMove());
-  }
+PositionHistory Search::GetPositionHistoryAtNode(const Node* node) const {
   PositionHistory history(played_history_);
-  for (auto iter = moves.rbegin(), end = moves.rend(); iter != end; ++iter) {
-    history.Append(*iter);
+  std::vector<Move> rmoves;
+  for (const Node* n = node; n != root_node_; n = n->GetParent()) {
+    rmoves.push_back(n->GetOwnEdge()->GetMove());
+  }
+  for (auto it = rmoves.rbegin(); it != rmoves.rend(); it++) {
+    history.Append(*it);
   }
-  const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1);
-  NNCacheLock nneval(cache_, hash);
-  return nneval;
+  return history;
 }
 
 void Search::MaybeTriggerStop(const IterationStats& stats,
@@ -630,7 +625,7 @@ void Search::MaybeTriggerStop(const IterationStats& stats,
   // Already responded bestmove, nothing to do here.
   if (bestmove_is_sent_) return;
   // Don't stop when the root node is not yet expanded.
-  if (total_playouts_ + initial_visits_ == 0) return;
+  if (stats.total_nodes == 0) return;
 
   if (!stop_.load(std::memory_order_acquire)) {
     if (stopper_->ShouldStop(stats, hints)) FireStopInternal();
@@ -901,7 +896,8 @@ EdgeAndNode Search::GetBestRootChildWithTemperature(float temperature) const {
 void Search::StartThreads(size_t how_many) {
   Mutex::Lock lock(threads_mutex_);
   if (how_many == 0 && threads_.size() == 0) {
-    how_many = network_->GetThreads() + !network_->IsCpu();
+    how_many = backend_attributes_.suggested_num_search_threads +
+               !backend_attributes_.runs_on_cpu;
   }
   thread_count_.store(how_many, std::memory_order_release);
   // First thread is a watchdog thread.
@@ -910,8 +906,8 @@ void Search::StartThreads(size_t how_many) {
   }
   // Start working threads.
   for (size_t i = 0; i < how_many; i++) {
-    threads_.emplace_back([this, i]() {
-      SearchWorker worker(this, params_, i);
+    threads_.emplace_back([this]() {
+      SearchWorker worker(this, params_);
       worker.RunBlocking();
     });
   }
@@ -961,7 +957,7 @@ void Search::PopulateCommonIterationStats(IterationStats* stats) {
     float max_q_plus_m = -1000;
     uint64_t max_n = 0;
     bool max_n_has_max_q_plus_m = true;
-    const auto m_evaluator = network_->GetCapabilities().has_mlh()
+    const auto m_evaluator = backend_attributes_.has_mlh
                                  ? MEvaluator(params_, root_node_)
                                  : MEvaluator();
     for (const auto& edge : root_node_->Edges()) {
@@ -1111,7 +1107,7 @@ void SearchWorker::RunTasks(int tid) {
             // We got the spin lock, double check we're still in the clear.
             if (nta < tc) {
               id = tasks_taken_.fetch_add(1, std::memory_order_acq_rel);
-              task = &picking_tasks_[id];
+              task = picking_tasks_.data() + id;
               task_taking_started_.store(0, std::memory_order_release);
               break;
             }
@@ -1159,7 +1155,7 @@ void SearchWorker::RunTasks(int tid) {
           break;
         }
       }
-      picking_tasks_[id].complete = true;
+      picking_tasks_.data()[id].complete = true;
       completed_tasks_.fetch_add(1, std::memory_order_acq_rel);
     }
   }
@@ -1167,7 +1163,7 @@ void SearchWorker::RunTasks(int tid) {
 
 void SearchWorker::ExecuteOneIteration() {
   // 1. Initialize internal structures.
-  InitializeIteration(search_->network_->NewComputation());
+  InitializeIteration();
 
   if (params_.GetMaxConcurrentSearchers() != 0) {
     std::unique_ptr<SpinHelper> spin_helper;
@@ -1256,11 +1252,12 @@ void SearchWorker::ExecuteOneIteration() {
 
 // 1. Initialize internal structures.
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-void SearchWorker::InitializeIteration(
-    std::unique_ptr<NetworkComputation> computation) {
-  computation_ = std::make_unique<CachingComputation>(std::move(computation),
-                                                      search_->cache_);
-  computation_->Reserve(target_minibatch_size_);
+void SearchWorker::InitializeIteration() {
+  LCTRACE_FUNCTION_SCOPE;
+  // Free the old computation before allocating a new one. This works better
+  // when backend caches buffer allocations between computations.
+  computation_.reset();
+  computation_ = search_->backend_->CreateComputation();
   minibatch_.clear();
   minibatch_.reserve(2 * target_minibatch_size_);
 }
@@ -1291,6 +1288,7 @@ int CalculateCollisionsLeft(int64_t nodes, const SearchParams& params) {
 }  // namespace
 
 void SearchWorker::GatherMinibatch() {
+  LCTRACE_FUNCTION_SCOPE;
   // Total number of nodes to process.
   int minibatch_size = 0;
   int cur_n = 0;
@@ -1316,7 +1314,7 @@ void SearchWorker::GatherMinibatch() {
   while (minibatch_size < target_minibatch_size_ &&
          number_out_of_order_ < max_out_of_order_) {
     // If there's something to process without touching slow neural net, do it.
-    if (minibatch_size > 0 && computation_->GetCacheMisses() == 0) return;
+    if (minibatch_size > 0 && computation_->UsedBatchSize() == 0) return;
 
     // If there is backend work to be done, and the backend is idle - exit
     // immediately.
@@ -1325,7 +1323,8 @@ void SearchWorker::GatherMinibatch() {
     // be keeping the backend busy. Which would mean that threads=1 has a
     // massive nps drop.
     if (thread_count > 1 && minibatch_size > 0 &&
-        computation_->GetCacheMisses() > params_.GetIdlingMinimumWork() &&
+        static_cast<int>(computation_->UsedBatchSize()) >
+            params_.GetIdlingMinimumWork() &&
         thread_count - search_->backend_waiting_counter_.load(
                            std::memory_order_relaxed) >
             params_.GetThreadIdlingThreshold()) {
@@ -1391,6 +1390,7 @@ void SearchWorker::GatherMinibatch() {
       }
     }
     if (some_ooo) {
+      LCTRACE_FUNCTION_SCOPE;
       SharedMutex::Lock lock(search_->nodes_mutex_);
       for (int i = static_cast<int>(minibatch_.size()) - 1; i >= new_start;
            i--) {
@@ -1414,22 +1414,8 @@ void SearchWorker::GatherMinibatch() {
         }
       }
     }
-    for (size_t i = new_start; i < minibatch_.size(); i++) {
-      // If there was no OOO, there can stil be collisions.
-      // There are no OOO though.
-      // Also terminals when OOO is disabled.
-      if (!minibatch_[i].nn_queried) continue;
-      if (minibatch_[i].is_cache_hit) {
-        // Since minibatch_[i] holds cache lock, this is guaranteed to succeed.
-        computation_->AddInputByHash(minibatch_[i].hash,
-                                     std::move(minibatch_[i].lock));
-      } else {
-        computation_->AddInput(minibatch_[i].hash,
-                               std::move(minibatch_[i].input_planes),
-                               std::move(minibatch_[i].probabilities_to_cache));
-      }
-    }
 
+    LCTRACE_FUNCTION_SCOPE;
     // Check for stop at the end so we have at least one node.
     for (size_t i = new_start; i < minibatch_.size(); i++) {
       auto& picked_node = minibatch_[i];
@@ -1458,6 +1444,7 @@ void SearchWorker::GatherMinibatch() {
 
 void SearchWorker::ProcessPickedTask(int start_idx, int end_idx,
                                      TaskWorkspace* workspace) {
+  LCTRACE_FUNCTION_SCOPE;
   auto& history = workspace->history;
   history = search_->played_history_;
 
@@ -1473,32 +1460,24 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx,
       ExtendNode(node, picked_node.depth, picked_node.moves_to_visit, &history);
       if (!node->IsTerminal()) {
         picked_node.nn_queried = true;
-        const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1);
-        picked_node.hash = hash;
-        picked_node.lock = NNCacheLock(search_->cache_, hash);
-        picked_node.is_cache_hit = picked_node.lock;
-        if (!picked_node.is_cache_hit) {
-          int transform;
-          picked_node.input_planes = EncodePositionForNN(
-              search_->network_->GetCapabilities().input_format, history, 8,
-              params_.GetHistoryFill(), &transform);
-          picked_node.probability_transform = transform;
-
-          std::vector<uint16_t>& moves = picked_node.probabilities_to_cache;
-          // Legal moves are known, use them.
-          moves.reserve(node->GetNumEdges());
-          for (const auto& edge : node->Edges()) {
-            moves.emplace_back(edge.GetMove().as_nn_index(transform));
-          }
-        } else {
-          picked_node.probability_transform = TransformForPosition(
-              search_->network_->GetCapabilities().input_format, history);
-        }
+        MoveList legal_moves;
+        legal_moves.reserve(node->GetNumEdges());
+        std::transform(node->Edges().begin(), node->Edges().end(),
+                       std::back_inserter(legal_moves),
+                       [](const auto& edge) { return edge.GetMove(); });
+        picked_node.eval->p.resize(legal_moves.size());
+        picked_node.is_cache_hit = computation_->AddInput(
+                                       EvalPosition{
+                                           .pos = history.GetPositions(),
+                                           .legal_moves = legal_moves,
+                                       },
+                                       picked_node.eval->AsPtr()) ==
+                                   BackendComputation::FETCHED_IMMEDIATELY;
       }
     }
     if (params_.GetOutOfOrderEval() && picked_node.CanEvalOutOfOrder()) {
       // Perform out of order eval for the last entry in minibatch_.
-      FetchSingleNodeResult(&picked_node, picked_node, 0);
+      FetchSingleNodeResult(&picked_node);
       picked_node.ooo_completed = true;
     }
   }
@@ -1527,7 +1506,7 @@ int SearchWorker::WaitForTasks() {
 
 void SearchWorker::PickNodesToExtend(int collision_limit) {
   ResetTasks();
-  if (task_workers_ > 0 && !search_->network_->IsCpu()) {
+  if (task_workers_ > 0 && !search_->backend_attributes_.runs_on_cpu) {
     // While nothing is ready yet - wake the task runners so they are ready to
     // receive quickly.
     Mutex::Lock lock(picking_tasks_mutex_);
@@ -1596,6 +1575,7 @@ void SearchWorker::PickNodesToExtendTask(
     const std::vector<Move>& moves_to_base,
     std::vector<NodeToProcess>* receiver,
     TaskWorkspace* workspace) NO_THREAD_SAFETY_ANALYSIS {
+  LCTRACE_FUNCTION_SCOPE;
   // TODO: Bring back pre-cached nodes created outside locks in a way that works
   // with tasks.
   // TODO: pre-reserve visits_to_perform for expected depth and likely maximum
@@ -2035,42 +2015,9 @@ void SearchWorker::ExtendNode(Node* node, int depth,
   node->CreateEdges(legal_moves);
 }
 
-// Returns whether node was already in cache.
-bool SearchWorker::AddNodeToComputation(Node* node) {
-  const auto hash = history_.HashLast(params_.GetCacheHistoryLength() + 1);
-  if (search_->cache_->ContainsKey(hash)) {
-    return true;
-  }
-  int transform;
-  auto planes =
-      EncodePositionForNN(search_->network_->GetCapabilities().input_format,
-                          history_, 8, params_.GetHistoryFill(), &transform);
-
-  std::vector<uint16_t> moves;
-
-  if (node && node->HasChildren()) {
-    // Legal moves are known, use them.
-    moves.reserve(node->GetNumEdges());
-    for (const auto& edge : node->Edges()) {
-      moves.emplace_back(edge.GetMove().as_nn_index(transform));
-    }
-  } else {
-    // Cache pseudolegal moves. A bit of a waste, but faster.
-    const auto& pseudolegal_moves =
-        history_.Last().GetBoard().GeneratePseudolegalMoves();
-    moves.reserve(pseudolegal_moves.size());
-    for (auto iter = pseudolegal_moves.begin(), end = pseudolegal_moves.end();
-         iter != end; ++iter) {
-      moves.emplace_back(iter->as_nn_index(transform));
-    }
-  }
-
-  computation_->AddInput(hash, std::move(planes), std::move(moves));
-  return false;
-}
-
 // 2b. Copy collisions into shared collisions.
 void SearchWorker::CollectCollisions() {
+  LCTRACE_FUNCTION_SCOPE;
   SharedMutex::Lock lock(search_->nodes_mutex_);
 
   for (const NodeToProcess& node_to_process : minibatch_) {
@@ -2084,17 +2031,19 @@ void SearchWorker::CollectCollisions() {
 // 3. Prefetch into cache.
 // ~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::MaybePrefetchIntoCache() {
+  LCTRACE_FUNCTION_SCOPE;
   // TODO(mooskagh) Remove prefetch into cache if node collisions work well.
   // If there are requests to NN, but the batch is not full, try to prefetch
   // nodes which are likely useful in future.
   if (search_->stop_.load(std::memory_order_acquire)) return;
-  if (computation_->GetCacheMisses() > 0 &&
-      computation_->GetCacheMisses() < params_.GetMaxPrefetchBatch()) {
+  if (computation_->UsedBatchSize() > 0 &&
+      static_cast<int>(computation_->UsedBatchSize()) <
+          params_.GetMaxPrefetchBatch()) {
     history_.Trim(search_->played_history_.GetLength());
     SharedMutex::SharedLock lock(search_->nodes_mutex_);
     PrefetchIntoCache(
         search_->root_node_,
-        params_.GetMaxPrefetchBatch() - computation_->GetCacheMisses(), false);
+        params_.GetMaxPrefetchBatch() - computation_->UsedBatchSize(), false);
   }
 }
 
@@ -2106,13 +2055,17 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) {
 
   // We are in a leaf, which is not yet being processed.
   if (!node || node->GetNStarted() == 0) {
-    if (AddNodeToComputation(node)) {
+    if (search_->backend_->GetCachedEvaluation(
+            EvalPosition{history_.GetPositions(), {}})) {
       // Make it return 0 to make it not use the slot, so that the function
       // tries hard to find something to cache even among unpopular moves.
       // In practice that slows things down a lot though, as it's not always
       // easy to find what to cache.
       return 1;
     }
+    auto moves = history_.Last().GetBoard().GenerateLegalMoves();
+    computation_->AddInput(EvalPosition{history_.GetPositions(), moves},
+                           EvalResultPtr{});
     return 1;
   }
 
@@ -2189,37 +2142,34 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) {
 
 // 4. Run NN computation.
 // ~~~~~~~~~~~~~~~~~~~~~~
-void SearchWorker::RunNNComputation() { computation_->ComputeBlocking(); }
+void SearchWorker::RunNNComputation() {
+  if (computation_->UsedBatchSize() > 0) computation_->ComputeBlocking();
+}
 
 // 5. Retrieve NN computations (and terminal values) into nodes.
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::FetchMinibatchResults() {
+  LCTRACE_FUNCTION_SCOPE;
   // Populate NN/cached results, or terminal results, into nodes.
-  int idx_in_computation = 0;
   for (auto& node_to_process : minibatch_) {
-    FetchSingleNodeResult(&node_to_process, *computation_, idx_in_computation);
-    if (node_to_process.nn_queried) ++idx_in_computation;
+    FetchSingleNodeResult(&node_to_process);
   }
 }
 
-template <typename Computation>
-void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process,
-                                         const Computation& computation,
-                                         int idx_in_computation) {
+void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process) {
   if (node_to_process->IsCollision()) return;
   Node* node = node_to_process->node;
   if (!node_to_process->nn_queried) {
     // Terminal nodes don't involve the neural NetworkComputation, nor do
     // they require any further processing after value retrieval.
-    node_to_process->v = node->GetWL();
-    node_to_process->d = node->GetD();
-    node_to_process->m = node->GetM();
+    node_to_process->eval->q = node->GetWL();
+    node_to_process->eval->d = node->GetD();
+    node_to_process->eval->m = node->GetM();
     return;
   }
+  node_to_process->eval->q = -node_to_process->eval->q;
   // For NN results, we need to populate policy as well as value.
   // First the value...
-  auto v = -computation.GetQVal(idx_in_computation);
-  auto d = computation.GetDVal(idx_in_computation);
   if (params_.GetWDLRescaleRatio() != 1.0f ||
       (params_.GetWDLRescaleDiff() != 0.0f &&
        search_->contempt_mode_ != ContemptMode::NONE)) {
@@ -2227,43 +2177,15 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process,
     bool root_stm = (search_->contempt_mode_ == ContemptMode::BLACK) ==
                     search_->played_history_.Last().IsBlackToMove();
     auto sign = (root_stm ^ (node_to_process->depth & 1)) ? 1.0f : -1.0f;
-    WDLRescale(v, d, params_.GetWDLRescaleRatio(),
+    WDLRescale(node_to_process->eval->q, node_to_process->eval->d,
+               params_.GetWDLRescaleRatio(),
                search_->contempt_mode_ == ContemptMode::NONE
                    ? 0
                    : params_.GetWDLRescaleDiff(),
                sign, false, params_.GetWDLMaxS());
   }
-  node_to_process->v = v;
-  node_to_process->d = d;
-  node_to_process->m = computation.GetMVal(idx_in_computation);
-  // ...and secondly, the policy data.
-  // Calculate maximum first.
-  float max_p = -std::numeric_limits<float>::infinity();
-  // Intermediate array to store values when processing policy.
-  // There are never more than 256 valid legal moves in any legal position.
-  std::array<float, 256> intermediate;
-  int counter = 0;
-  for (auto& edge : node->Edges()) {
-    float p = computation.GetPVal(
-        idx_in_computation,
-        edge.GetMove().as_nn_index(node_to_process->probability_transform));
-    intermediate[counter++] = p;
-    max_p = std::max(max_p, p);
-  }
-  float total = 0.0;
-  for (int i = 0; i < counter; i++) {
-    // Perform softmax and take into account policy softmax temperature T.
-    // Note that we want to calculate (exp(p-max_p))^(1/T) = exp((p-max_p)/T).
-    float p =
-        FastExp((intermediate[i] - max_p) / params_.GetPolicySoftmaxTemp());
-    intermediate[i] = p;
-    total += p;
-  }
-  counter = 0;
-  // Normalize P values to add up to 1.0.
-  const float scale = total > 0.0f ? 1.0f / total : 1.0f;
-  for (auto& edge : node->Edges()) {
-    edge.edge()->SetP(intermediate[counter++] * scale);
+  for (size_t p_idx = 0; auto& edge : node->Edges()) {
+    edge.edge()->SetP(node_to_process->eval->p[p_idx++]);
   }
   // Add Dirichlet noise if enabled and at root.
   if (params_.GetNoiseEpsilon() && node == search_->root_node_) {
@@ -2276,6 +2198,7 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process,
 // 6. Propagate the new nodes' information to all their parents in the tree.
 // ~~~~~~~~~~~~~~
 void SearchWorker::DoBackupUpdate() {
+  LCTRACE_FUNCTION_SCOPE;
   // Nodes mutex for doing node updates.
   SharedMutex::Lock lock(search_->nodes_mutex_);
 
@@ -2304,9 +2227,9 @@ void SearchWorker::DoBackupUpdateSingleNode(
       params_.GetStickyEndgames() && node->IsTerminal() && !node->GetN();
 
   // Backup V value up to a root. After 1 visit, V = Q.
-  float v = node_to_process.v;
-  float d = node_to_process.d;
-  float m = node_to_process.m;
+  float v = node_to_process.eval->q;
+  float d = node_to_process.eval->d;
+  float m = node_to_process.eval->m;
   int n_to_fix = 0;
   float v_delta = 0.0f;
   float d_delta = 0.0f;
@@ -2369,6 +2292,9 @@ void SearchWorker::DoBackupUpdateSingleNode(
     }
   }
   search_->total_playouts_ += node_to_process.multivisit;
+  if (node_to_process.nn_queried && !node_to_process.is_cache_hit) {
+    search_->network_evaluations_++;
+  }
   search_->cum_depth_ += node_to_process.depth * node_to_process.multivisit;
   search_->max_depth_ = std::max(search_->max_depth_, node_to_process.depth);
 }
@@ -2445,6 +2371,7 @@ bool SearchWorker::MaybeSetBounds(Node* p, float m, int* n_to_fix,
 // 7. Update the Search's status and progress information.
 //~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::UpdateCounters() {
+  LCTRACE_FUNCTION_SCOPE;
   search_->PopulateCommonIterationStats(&iteration_stats_);
   search_->MaybeTriggerStop(iteration_stats_, &latest_time_manager_hints_);
   search_->MaybeOutputInfo();
@@ -2466,4 +2393,5 @@ void SearchWorker::UpdateCounters() {
   }
 }
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/search.h b/src/search/classic/search.h
similarity index 85%
rename from src/mcts/search.h
rename to src/search/classic/search.h
index c2ff2aa116..34293f3173 100644
--- a/src/mcts/search.h
+++ b/src/search/classic/search.h
@@ -36,26 +36,25 @@
 
 #include "chess/callbacks.h"
 #include "chess/uciloop.h"
-#include "mcts/node.h"
-#include "mcts/params.h"
-#include "mcts/stoppers/timemgr.h"
-#include "neural/cache.h"
-#include "neural/network.h"
+#include "neural/backend.h"
+#include "search/classic/node.h"
+#include "search/classic/params.h"
+#include "search/classic/stoppers/timemgr.h"
 #include "syzygy/syzygy.h"
 #include "utils/logging.h"
 #include "utils/mutex.h"
 
 namespace lczero {
+namespace classic {
 
 class Search {
  public:
-  Search(const NodeTree& tree, Network* network,
+  Search(const NodeTree& tree, Backend* network,
          std::unique_ptr<UciResponder> uci_responder,
          const MoveList& searchmoves,
          std::chrono::steady_clock::time_point start_time,
          std::unique_ptr<SearchStopper> stopper, bool infinite, bool ponder,
-         const OptionsDict& options, NNCache* cache,
-         SyzygyTablebase* syzygy_tb);
+         const OptionsDict& options, SyzygyTablebase* syzygy_tb);
 
   ~Search();
 
@@ -95,9 +94,6 @@ class Search {
   // from temperature having been applied again.
   void ResetBestMove();
 
-  // Returns NN eval for a given node from cache, if that node is cached.
-  NNCacheLock GetCachedNNEval(const Node* node) const;
-
  private:
   // Computes the best move, maybe with temperature (according to the settings).
   void EnsureBestMoveKnown();
@@ -130,7 +126,7 @@ class Search {
 
   // Returns verbose information about given node, as vector of strings.
   // Node can only be root or ponder (depth 1).
-  std::vector<std::string> GetVerboseStats(Node* node) const;
+  std::vector<std::string> GetVerboseStats(const Node* node) const;
 
   // Returns the draw score at the root of the search. At odd depth pass true to
   // the value of @is_odd_depth to change the sign of the draw score.
@@ -140,6 +136,8 @@ class Search {
   // Ensure that all shared collisions are cancelled and clear them out.
   void CancelSharedCollisions();
 
+  PositionHistory GetPositionHistoryAtNode(const Node* node) const;
+
   mutable Mutex counters_mutex_ ACQUIRED_AFTER(nodes_mutex_);
   // Tells all threads to stop.
   std::atomic<bool> stop_{false};
@@ -162,12 +160,12 @@ class Search {
   std::vector<std::thread> threads_ GUARDED_BY(threads_mutex_);
 
   Node* root_node_;
-  NNCache* cache_;
   SyzygyTablebase* syzygy_tb_;
   // Fixed positions which happened before the search.
   const PositionHistory& played_history_;
 
-  Network* const network_;
+  Backend* const backend_;
+  BackendAttributes backend_attributes_;
   const SearchParams params_;
   const MoveList searchmoves_;
   const std::chrono::steady_clock::time_point start_time_;
@@ -183,6 +181,7 @@ class Search {
   Edge* last_outputted_info_edge_ GUARDED_BY(nodes_mutex_) = nullptr;
   ThinkingInfo last_outputted_uci_info_ GUARDED_BY(nodes_mutex_);
   int64_t total_playouts_ GUARDED_BY(nodes_mutex_) = 0;
+  int64_t network_evaluations_ GUARDED_BY(nodes_mutex_) = 0;
   int64_t total_batches_ GUARDED_BY(nodes_mutex_) = 0;
   // Maximum search depth = length of longest path taken in PickNodetoExtend.
   uint16_t max_depth_ GUARDED_BY(nodes_mutex_) = 0;
@@ -209,16 +208,14 @@ class Search {
 // within one thread, have to split into stages.
 class SearchWorker {
  public:
-  SearchWorker(Search* search, const SearchParams& params, int id)
+  SearchWorker(Search* search, const SearchParams& params)
       : search_(search),
         history_(search_->played_history_),
         params_(params),
-        moves_left_support_(search_->network_->GetCapabilities().moves_left !=
-                            pblczero::NetworkFormat::MOVES_LEFT_NONE) {
-    search_->network_->InitThread(id);
+        moves_left_support_(search_->backend_attributes_.has_mlh) {
     task_workers_ = params.GetTaskWorkersPerSearchWorker();
     if (task_workers_ < 0) {
-      if (search_->network_->IsCpu()) {
+      if (search_->backend_attributes_.runs_on_cpu) {
         task_workers_ = 0;
       } else {
         int working_threads = std::max(
@@ -229,13 +226,12 @@ class SearchWorker {
     }
     for (int i = 0; i < task_workers_; i++) {
       task_workspaces_.emplace_back();
-      task_threads_.emplace_back([this, i]() {
-        this->RunTasks(i);
-      });
+      task_threads_.emplace_back([this, i]() { this->RunTasks(i); });
     }
     target_minibatch_size_ = params_.GetMiniBatchSize();
     if (target_minibatch_size_ == 0) {
-      target_minibatch_size_ = search_->network_->GetMiniBatchSize();
+      target_minibatch_size_ =
+          search_->backend_attributes_.recommended_batch_size;
     }
     max_out_of_order_ =
         std::max(1, static_cast<int>(params_.GetMaxOutOfOrderEvalsFactor() *
@@ -283,7 +279,7 @@ class SearchWorker {
   // The same operations one by one:
   // 1. Initialize internal structures.
   // @computation is the computation to use on this iteration.
-  void InitializeIteration(std::unique_ptr<NetworkComputation> computation);
+  void InitializeIteration();
 
   // 2. Gather minibatch.
   void GatherMinibatch();
@@ -316,12 +312,7 @@ class SearchWorker {
 
     // The node to extend.
     Node* node;
-    // Value from NN's value head, or -1/0/1 for terminal nodes.
-    float v;
-    // Draw probability for NN's with WDL value head.
-    float d;
-    // Estimated remaining plies left.
-    float m;
+    std::unique_ptr<EvalResult> eval;
     int multivisit = 0;
     // If greater than multivisit, and other parameters don't imply a lower
     // limit, multivist could be increased to this value without additional
@@ -331,19 +322,10 @@ class SearchWorker {
     bool nn_queried = false;
     bool is_cache_hit = false;
     bool is_collision = false;
-    int probability_transform = 0;
-
-    // Details only populated in the multigather path.
-
     // Only populated for visits,
     std::vector<Move> moves_to_visit;
 
     // Details that are filled in as we go.
-    uint64_t hash;
-    NNCacheLock lock;
-    std::vector<uint16_t> probabilities_to_cache;
-    InputPlanes input_planes;
-    mutable int last_idx = 0;
     bool ooo_completed = false;
 
     static NodeToProcess Collision(Node* node, uint16_t depth,
@@ -358,34 +340,11 @@ class SearchWorker {
       return NodeToProcess(node, depth, false, 1, 0);
     }
 
-    // Methods to allow NodeToProcess to conform as a 'Computation'. Only safe
-    // to call if is_cache_hit is true in the multigather path.
-
-    float GetQVal(int) const { return lock->q; }
-
-    float GetDVal(int) const { return lock->d; }
-
-    float GetMVal(int) const { return lock->m; }
-
-    float GetPVal(int, int move_id) const {
-      const auto& moves = lock->p;
-
-      int total_count = 0;
-      while (total_count < moves.size()) {
-        // Optimization: usually moves are stored in the same order as queried.
-        const auto& move = moves[last_idx++];
-        if (last_idx == moves.size()) last_idx = 0;
-        if (move.first == move_id) return move.second;
-        ++total_count;
-      }
-      assert(false);  // Move not found.
-      return 0;
-    }
-
    private:
     NodeToProcess(Node* node, uint16_t depth, bool is_collision, int multivisit,
                   int max_count)
         : node(node),
+          eval(std::make_unique<EvalResult>()),
           multivisit(multivisit),
           maxvisit(max_count),
           depth(depth),
@@ -440,15 +399,14 @@ class SearchWorker {
   };
 
   NodeToProcess PickNodeToExtend(int collision_limit);
-  bool AddNodeToComputation(Node* node);
   int PrefetchIntoCache(Node* node, int budget, bool is_odd_depth);
   void DoBackupUpdateSingleNode(const NodeToProcess& node_to_process);
   // Returns whether a node's bounds were set based on its children.
   bool MaybeSetBounds(Node* p, float m, int* n_to_fix, float* v_delta,
                       float* d_delta, float* m_delta) const;
   void PickNodesToExtend(int collision_limit);
-  void PickNodesToExtendTask(Node* starting_point, int collision_limit,
-                             int base_depth,
+  void PickNodesToExtendTask(Node* starting_point, int base_depth,
+                             int collision_limit,
                              const std::vector<Move>& moves_to_base,
                              std::vector<NodeToProcess>* receiver,
                              TaskWorkspace* workspace);
@@ -457,10 +415,7 @@ class SearchWorker {
                          TaskWorkspace* workspace);
   void ExtendNode(Node* node, int depth, const std::vector<Move>& moves_to_add,
                   PositionHistory* history);
-  template <typename Computation>
-  void FetchSingleNodeResult(NodeToProcess* node_to_process,
-                             const Computation& computation,
-                             int idx_in_computation);
+  void FetchSingleNodeResult(NodeToProcess* node_to_process);
   void RunTasks(int tid);
   void ResetTasks();
   // Returns how many tasks there were.
@@ -469,7 +424,7 @@ class SearchWorker {
   Search* const search_;
   // List of nodes to process.
   std::vector<NodeToProcess> minibatch_;
-  std::unique_ptr<CachingComputation> computation_;
+  std::unique_ptr<BackendComputation> computation_;
   int task_workers_;
   int target_minibatch_size_;
   int max_out_of_order_;
@@ -477,7 +432,6 @@ class SearchWorker {
   PositionHistory history_;
   int number_out_of_order_ = 0;
   const SearchParams& params_;
-  std::unique_ptr<Node> precached_node_;
   const bool moves_left_support_;
   IterationStats iteration_stats_;
   StoppersHints latest_time_manager_hints_;
@@ -497,4 +451,5 @@ class SearchWorker {
   bool exiting_ = false;
 };
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/alphazero.cc b/src/search/classic/stoppers/alphazero.cc
similarity index 84%
rename from src/mcts/stoppers/alphazero.cc
rename to src/search/classic/stoppers/alphazero.cc
index 84e04c0e2a..a0cacf911c 100644
--- a/src/mcts/stoppers/alphazero.cc
+++ b/src/search/classic/stoppers/alphazero.cc
@@ -25,9 +25,10 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/stoppers/stoppers.h"
+#include "search/classic/stoppers/stoppers.h"
 
 namespace lczero {
+namespace classic {
 
 namespace {
 
@@ -41,7 +42,10 @@ class AlphazeroTimeManager : public TimeManager {
       throw Exception("alphazero-time-pct value to be in range [0.0, 100.0]");
   }
   std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
-                                            const NodeTree& tree) override;
+                                            const Position& position,
+                                            size_t /*total_memory*/,
+                                            size_t /*avg_node_size*/,
+                                            uint32_t /*nodes*/) override;
 
  private:
   const int64_t move_overhead_;
@@ -49,8 +53,8 @@ class AlphazeroTimeManager : public TimeManager {
 };
 
 std::unique_ptr<SearchStopper> AlphazeroTimeManager::GetStopper(
-    const GoParams& params, const NodeTree& tree) {
-  const Position& position = tree.HeadPosition();
+    const GoParams& params, const Position& position, size_t /*total_memory*/,
+    size_t /*avg_node_size*/, uint32_t /*nodes*/) {
   const bool is_black = position.IsBlackToMove();
   const std::optional<int64_t>& time = (is_black ? params.btime : params.wtime);
   // If no time limit is given, don't stop on this condition.
@@ -73,4 +77,5 @@ std::unique_ptr<TimeManager> MakeAlphazeroTimeManager(
     int64_t move_overhead, const OptionsDict& params) {
   return std::make_unique<AlphazeroTimeManager>(move_overhead, params);
 }
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/alphazero.h b/src/search/classic/stoppers/alphazero.h
similarity index 96%
rename from src/mcts/stoppers/alphazero.h
rename to src/search/classic/stoppers/alphazero.h
index f342cbe0d8..77404a0673 100644
--- a/src/mcts/stoppers/alphazero.h
+++ b/src/search/classic/stoppers/alphazero.h
@@ -30,8 +30,10 @@
 #include "utils/optionsdict.h"
 
 namespace lczero {
+namespace classic {
 
 std::unique_ptr<TimeManager> MakeAlphazeroTimeManager(
     int64_t move_overhead, const OptionsDict& params);
 
+}  // namespace classic
 }  // namespace lczero
\ No newline at end of file
diff --git a/src/mcts/stoppers/common.cc b/src/search/classic/stoppers/common.cc
similarity index 58%
rename from src/mcts/stoppers/common.cc
rename to src/search/classic/stoppers/common.cc
index 2fd95f578e..313176bd9c 100644
--- a/src/mcts/stoppers/common.cc
+++ b/src/search/classic/stoppers/common.cc
@@ -25,48 +25,63 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "src/mcts/stoppers/common.h"
+#include "search/classic/stoppers/common.h"
 
-namespace lczero {
+#include "neural/shared_params.h"
 
-const OptionId kNNCacheSizeId{
-    "nncache", "NNCacheSize",
-    "Number of positions to store in a memory cache. A large cache can speed "
-    "up searching, but takes memory."};
+namespace lczero {
+namespace classic {
 
 namespace {
 const OptionId kRamLimitMbId{
-    "ramlimit-mb", "RamLimitMb",
-    "Maximum memory usage for the engine, in megabytes. The estimation is very "
-    "rough, and can be off by a lot. For example, multiple visits to a "
-    "terminal node counted several times, and the estimation assumes that all "
-    "positions have 30 possible moves. When set to 0, no RAM limit is "
-    "enforced."};
+    {.long_flag = "ramlimit-mb",
+     .uci_option = "RamLimitMb",
+     .help_text =
+         "Maximum memory usage for the engine, in megabytes. The estimation is "
+         "very rough, and can be off by a lot. For example, multiple visits to "
+         "a terminal node counted several times, and the estimation assumes "
+         "that all positions have 30 possible moves. When set to 0, no RAM "
+         "limit is enforced.",
+     .visibility = OptionId::kAlwaysVisible}};
 const OptionId kMinimumKLDGainPerNodeId{
-    "minimum-kldgain-per-node", "MinimumKLDGainPerNode",
-    "If greater than 0 search will abort unless the last "
-    "KLDGainAverageInterval nodes have an average gain per node of at least "
-    "this much."};
+    {.long_flag = "minimum-kldgain-per-node",
+     .uci_option = "MinimumKLDGainPerNode",
+     .help_text = "If greater than 0 search will abort unless the last "
+                  "KLDGainAverageInterval nodes have an average gain per node "
+                  "of at least this much.",
+     .visibility = OptionId::kProOnly}};
 const OptionId kKLDGainAverageIntervalId{
-    "kldgain-average-interval", "KLDGainAverageInterval",
-    "Used to decide how frequently to evaluate the average KLDGainPerNode to "
-    "check the MinimumKLDGainPerNode, if specified."};
+    {.long_flag = "kldgain-average-interval",
+     .uci_option = "KLDGainAverageInterval",
+     .help_text =
+         "Used to decide how frequently to evaluate the average KLDGainPerNode "
+         "to check the MinimumKLDGainPerNode, if specified.",
+     .visibility = OptionId::kProOnly}};
 const OptionId kSmartPruningFactorId{
-    "smart-pruning-factor", "SmartPruningFactor",
-    "Do not spend time on the moves which cannot become bestmove given the "
-    "remaining time to search. When no other move can overtake the current "
-    "best, the search stops, saving the time. Values greater than 1 stop less "
-    "promising moves from being considered even earlier. Values less than 1 "
-    "causes hopeless moves to still have some attention. When set to 0, smart "
-    "pruning is deactivated."};
+    {.long_flag = "smart-pruning-factor",
+     .uci_option = "SmartPruningFactor",
+     .help_text =
+         "Do not spend time on the moves which cannot become bestmove given "
+         "the remaining time to search. When no other move can overtake the "
+         "current best, the search stops, saving the time. Values greater than "
+         "1 stop less promising moves from being considered even earlier. "
+         "Values less than 1 causes hopeless moves to still have some "
+         "attention. When set to 0, smart pruning is deactivated.",
+     .visibility = OptionId::kDefaultVisibility}};
 const OptionId kMinimumSmartPruningBatchesId{
-    "smart-pruning-minimum-batches", "SmartPruningMinimumBatches",
-    "Only allow smart pruning to stop search after at least this many batches "
-    "have been evaluated. It may be useful to have this value greater than the "
-    "number of search threads in use."};
+    {.long_flag = "smart-pruning-minimum-batches",
+     .uci_option = "SmartPruningMinimumBatches",
+     .help_text =
+         "Only allow smart pruning to stop search after at least this many "
+         "batches have been evaluated. It may be useful to have this value "
+         "greater than the number of search threads in use.",
+     .visibility = OptionId::kDefaultVisibility}};
 const OptionId kNodesAsPlayoutsId{
-    "nodes-as-playouts", "NodesAsPlayouts",
-    "Treat UCI `go nodes` command as referring to playouts instead of visits."};
+    {.long_flag = "nodes-as-playouts",
+     .uci_option = "NodesAsPlayouts",
+     .help_text = "Treat UCI `go nodes` command as referring to playouts "
+                  "instead of visits.",
+     .visibility = OptionId::kProOnly}};
 
 }  // namespace
 
@@ -77,17 +92,7 @@ void PopulateCommonStopperOptions(RunType for_what, OptionsParser* options) {
       (for_what == RunType::kUci ? 1.33f : 0.00f);
   options->Add<IntOption>(kMinimumSmartPruningBatchesId, 0, 10000) = 0;
   options->Add<BoolOption>(kNodesAsPlayoutsId) = false;
-
-  if (for_what == RunType::kUci || for_what == RunType::kSimpleUci) {
-    options->Add<IntOption>(kRamLimitMbId, 0, 100000000) = 0;
-    options->HideOption(kMinimumKLDGainPerNodeId);
-    options->HideOption(kKLDGainAverageIntervalId);
-    options->HideOption(kNodesAsPlayoutsId);
-  }
-  if (for_what == RunType::kSimpleUci) {
-    options->HideOption(kSmartPruningFactorId);
-    options->HideOption(kMinimumSmartPruningBatchesId);
-  }
+  options->Add<IntOption>(kRamLimitMbId, 0, 100000000) = 0;
 }
 
 // Parameters needed for selfplay and uci, but not benchmark nor infinite mode.
@@ -112,21 +117,22 @@ namespace {
 // Stoppers for uci mode only.
 void PopulateCommonUciStoppers(ChainedSearchStopper* stopper,
                                const OptionsDict& options,
-                               const GoParams& params, int64_t move_overhead) {
+                               const GoParams& params, size_t total_memory,
+                               size_t avg_node_size, uint32_t nodes,
+                               int64_t move_overhead) {
   const bool infinite = params.infinite || params.ponder || params.mate;
 
   // RAM limit watching stopper.
-  const auto cache_size_mb = options.Get<int>(kNNCacheSizeId);
-  const int ram_limit = options.Get<int>(kRamLimitMbId);
-  if (ram_limit) {
+  const int ram_limit_mb = options.Get<int>(kRamLimitMbId);
+  if (ram_limit_mb) {
     stopper->AddStopper(std::make_unique<MemoryWatchingStopper>(
-        cache_size_mb, ram_limit,
+        ram_limit_mb, total_memory, avg_node_size, nodes,
         options.Get<float>(kSmartPruningFactorId) > 0.0f));
   }
 
   // "go nodes" stopper.
-  int64_t node_limit = 0;
-  if (params.nodes) {
+  int64_t node_limit = 4000000000;
+  if (params.nodes.has_value()) {
     if (options.Get<bool>(kNodesAsPlayoutsId)) {
       stopper->AddStopper(std::make_unique<PlayoutsStopper>(
           *params.nodes, options.Get<float>(kSmartPruningFactorId) > 0.0f));
@@ -134,8 +140,7 @@ void PopulateCommonUciStoppers(ChainedSearchStopper* stopper,
       node_limit = *params.nodes;
     }
   }
-  // always limit nodes to avoid exceeding the limit 4000000000. That number is
-  // default when node_limit = 0.
+  // Always limit nodes to avoid exceeding the limit 4000000000.
   stopper->AddStopper(std::make_unique<VisitsStopper>(
       node_limit, options.Get<float>(kSmartPruningFactorId) > 0.0f));
 
@@ -169,10 +174,16 @@ class CommonTimeManager : public TimeManager {
 
  private:
   std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
-                                            const NodeTree& tree) override {
+                                            const Position& position,
+                                            size_t total_memory,
+                                            size_t avg_node_size,
+                                            uint32_t nodes) override {
     auto result = std::make_unique<ChainedSearchStopper>();
-    if (child_mgr_) result->AddStopper(child_mgr_->GetStopper(params, tree));
-    PopulateCommonUciStoppers(result.get(), options_, params, move_overhead_);
+    if (child_mgr_)  // Chain the wrapped manager's stopper first, if any.
+      result->AddStopper(child_mgr_->GetStopper(params, position, total_memory,
+                                                avg_node_size, nodes));
+    PopulateCommonUciStoppers(result.get(), options_, params, total_memory,
+                              avg_node_size, nodes, move_overhead_);
     return result;
   }
 
@@ -190,4 +201,5 @@ std::unique_ptr<TimeManager> MakeCommonTimeManager(
                                              move_overhead);
 }
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/common.h b/src/search/classic/stoppers/common.h
similarity index 87%
rename from src/mcts/stoppers/common.h
rename to src/search/classic/stoppers/common.h
index fe35115ef5..8253056d8d 100644
--- a/src/mcts/stoppers/common.h
+++ b/src/search/classic/stoppers/common.h
@@ -27,19 +27,16 @@
 
 #pragma once
 
-#include "mcts/stoppers/stoppers.h"
+#include "search/classic/stoppers/stoppers.h"
 #include "utils/optionsdict.h"
 #include "utils/optionsparser.h"
 
 namespace lczero {
+namespace classic {
 
-enum class RunType { kUci, kSimpleUci, kSelfplay };
+enum class RunType { kUci, kSelfplay };
 void PopulateCommonStopperOptions(RunType for_what, OptionsParser* options);
 
-// Option ID for a cache size. It's used from multiple places and there's no
-// really nice place to declare, so let it be here.
-extern const OptionId kNNCacheSizeId;
-
 // Populates KLDGain and SmartPruning stoppers.
 void PopulateIntrinsicStoppers(ChainedSearchStopper* stopper,
                                const OptionsDict& options);
@@ -48,4 +45,5 @@ std::unique_ptr<TimeManager> MakeCommonTimeManager(
     std::unique_ptr<TimeManager> child_manager, const OptionsDict& options,
     int64_t move_overhead);
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/factory.cc b/src/search/classic/stoppers/factory.cc
similarity index 60%
rename from src/mcts/stoppers/factory.cc
rename to src/search/classic/stoppers/factory.cc
index 3da29d1aa1..7fbe16700e 100644
--- a/src/mcts/stoppers/factory.cc
+++ b/src/search/classic/stoppers/factory.cc
@@ -25,59 +25,65 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/stoppers/factory.h"
+#include "search/classic/stoppers/factory.h"
 
 #include <optional>
 
 #include "factory.h"
-#include "mcts/stoppers/alphazero.h"
-#include "mcts/stoppers/legacy.h"
-#include "mcts/stoppers/simple.h"
-#include "mcts/stoppers/smooth.h"
-#include "mcts/stoppers/stoppers.h"
+#include "search/classic/stoppers/alphazero.h"
+#include "search/classic/stoppers/legacy.h"
+#include "search/classic/stoppers/simple.h"
+#include "search/classic/stoppers/smooth.h"
+#include "search/classic/stoppers/stoppers.h"
 #include "utils/exception.h"
 
 namespace lczero {
+namespace classic {
 namespace {
 
 const OptionId kMoveOverheadId{
-    "move-overhead", "MoveOverheadMs",
-    "Amount of time, in milliseconds, that the engine subtracts from it's "
-    "total available time (to compensate for slow connection, interprocess "
-    "communication, etc)."};
+    {.long_flag = "move-overhead",
+     .uci_option = "MoveOverheadMs",
+     .help_text =
+         "Amount of time, in milliseconds, that the engine subtracts from its "
+         "total available time (to compensate for slow connection, "
+         "interprocess communication, etc).",
+     .visibility = OptionId::kAlwaysVisible}};
 const OptionId kTimeManagerId{
-    "time-manager", "TimeManager",
-    "Name and config of a time manager. "
-    "Possible names are 'legacy' (default), 'smooth', 'alphazero', and simple."
-    "See https://lc0.org/timemgr for configuration details."};
+    {.long_flag = "time-manager",
+     .uci_option = "TimeManager",
+     .help_text =
+         "Name and config of a time manager. Possible names are 'legacy' "
+         "(default), 'smooth', 'alphazero', and simple. See "
+         "https://lc0.org/timemgr for configuration details."}};
 const OptionId kSlowMoverId{
-    "slowmover", "Slowmover",
-    "Budgeted time for a move is multiplied by this value, causing the engine "
-    "to spend more time (if value is greater than 1) or less time (if the "
-    "value is less than 1)."};
+    {.long_flag = "slowmover",
+     .uci_option = "Slowmover",
+     .help_text = "Budgeted time for a move is multiplied by this value, "
+                  "causing the engine to spend more time (if value is greater "
+                  "than 1) or less time (if the value is less than 1).",
+     .visibility = OptionId::kSimpleOnly}};
 }  // namespace
 
 void PopulateTimeManagementOptions(RunType for_what, OptionsParser* options) {
   PopulateCommonStopperOptions(for_what, options);
-  if (for_what == RunType::kUci || for_what == RunType::kSimpleUci) {
-    options->Add<IntOption>(kMoveOverheadId, 0, 100000000) = 200;
-    if (for_what == RunType::kUci) {
-      options->Add<StringOption>(kTimeManagerId) = "legacy";
-    } else {
-      options->Add<FloatOption>(kSlowMoverId, 0.0f, 100.0f) = 1.0f;
-    }
-  }
+  options->Add<IntOption>(kMoveOverheadId, 0, 100000000) = 200;
+  options->Add<StringOption>(kTimeManagerId) = "legacy";
+  options->Add<FloatOption>(kSlowMoverId, 0.0f, 100.0f) = 1.0f;
 }
 
 std::unique_ptr<TimeManager> MakeTimeManager(const OptionsDict& options) {
   const int64_t move_overhead = options.Get<int>(kMoveOverheadId);
 
   OptionsDict tm_options;
-  if (options.Exists<std::string>(kTimeManagerId)) {
-    tm_options.AddSubdictFromString(options.Get<std::string>(kTimeManagerId));
-  } else {
+  tm_options.AddSubdictFromString(options.Get<std::string>(kTimeManagerId));
+  if (!options.IsDefault<float>(kSlowMoverId)) {
+    // Assume that default behavior of simple and normal mode is the same.
+    if (!options.IsDefault<std::string>(kTimeManagerId)) {
+      throw Exception("You can't set both time manager and slowmover value");
+    }
     float slowmover = options.Get<float>(kSlowMoverId);
-    tm_options.AddSubdict("legacy")->Set("slowmover", slowmover);
+    tm_options.GetMutableSubdict("legacy")->Set("slowmover", slowmover);
   }
   const auto managers = tm_options.ListSubdicts();
 
@@ -109,4 +115,5 @@ std::unique_ptr<TimeManager> MakeTimeManager(const OptionsDict& options) {
   return MakeCommonTimeManager(std::move(time_manager), options, move_overhead);
 }
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/factory.h b/src/search/classic/stoppers/factory.h
similarity index 92%
rename from src/mcts/stoppers/factory.h
rename to src/search/classic/stoppers/factory.h
index a8b0c15c7e..6c5eba1c24 100644
--- a/src/mcts/stoppers/factory.h
+++ b/src/search/classic/stoppers/factory.h
@@ -27,12 +27,13 @@
 
 #pragma once
 
-#include "mcts/stoppers/common.h"
-#include "mcts/stoppers/timemgr.h"
+#include "search/classic/stoppers/common.h"
+#include "search/classic/stoppers/timemgr.h"
 #include "utils/optionsdict.h"
 #include "utils/optionsparser.h"
 
 namespace lczero {
+namespace classic {
 
 // Populates UCI/command line flags with time management options.
 void PopulateTimeManagementOptions(RunType for_what, OptionsParser* options);
@@ -40,4 +41,5 @@ void PopulateTimeManagementOptions(RunType for_what, OptionsParser* options);
 // Creates a new time manager for a new search.
 std::unique_ptr<TimeManager> MakeTimeManager(const OptionsDict& dict);
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/legacy.cc b/src/search/classic/stoppers/legacy.cc
similarity index 92%
rename from src/mcts/stoppers/legacy.cc
rename to src/search/classic/stoppers/legacy.cc
index 77bc712381..1f13435e79 100644
--- a/src/mcts/stoppers/legacy.cc
+++ b/src/search/classic/stoppers/legacy.cc
@@ -25,11 +25,14 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/stoppers/legacy.h"
+#include "search/classic/stoppers/legacy.h"
 
-#include "mcts/stoppers/stoppers.h"
+#include <cmath>
+
+#include "search/classic/stoppers/stoppers.h"
 
 namespace lczero {
+namespace classic {
 
 float ComputeEstimatedMovesToGo(int ply, float midpoint, float steepness) {
   // An analysis of chess games shows that the distribution of game lengths
@@ -75,7 +78,10 @@ class LegacyTimeManager : public TimeManager {
         first_move_bonus_(params.GetOrDefault<float>("first-move-bonus", 1.8f)),
         book_ply_bonus_(params.GetOrDefault<float>("book-ply-bonus", 0.25f)) {}
   std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
-                                            const NodeTree& tree) override;
+                                            const Position& position,
+                                            size_t /*total_memory*/,
+                                            size_t /*avg_node_size*/,
+                                            uint32_t /*nodes*/) override;
 
  private:
   const int64_t move_overhead_;
@@ -92,8 +98,8 @@ class LegacyTimeManager : public TimeManager {
 };
 
 std::unique_ptr<SearchStopper> LegacyTimeManager::GetStopper(
-    const GoParams& params, const NodeTree& tree) {
-  const Position& position = tree.HeadPosition();
+    const GoParams& params, const Position& position, size_t /*total_memory*/,
+    size_t /*avg_node_size*/, uint32_t /*nodes*/) {
   const bool is_black = position.IsBlackToMove();
   const std::optional<int64_t>& time = (is_black ? params.btime : params.wtime);
   // If no time limit is given, don't stop on this condition.
@@ -169,4 +175,5 @@ std::unique_ptr<TimeManager> MakeLegacyTimeManager(int64_t move_overhead,
                                                    const OptionsDict& params) {
   return std::make_unique<LegacyTimeManager>(move_overhead, params);
 }
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/legacy.h b/src/search/classic/stoppers/legacy.h
similarity index 94%
rename from src/mcts/stoppers/legacy.h
rename to src/search/classic/stoppers/legacy.h
index 82cbf3f85b..865ecd42ba 100644
--- a/src/mcts/stoppers/legacy.h
+++ b/src/search/classic/stoppers/legacy.h
@@ -27,14 +27,16 @@
 
 #pragma once
 
-#include "mcts/stoppers/timemgr.h"
+#include "search/classic/stoppers/timemgr.h"
 #include "utils/optionsdict.h"
 
 namespace lczero {
+namespace classic {
 
 float ComputeEstimatedMovesToGo(int ply, float midpoint, float steepness);
 
 std::unique_ptr<TimeManager> MakeLegacyTimeManager(int64_t move_overhead,
                                                    const OptionsDict& params);
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/simple.cc b/src/search/classic/stoppers/simple.cc
similarity index 88%
rename from src/mcts/stoppers/simple.cc
rename to src/search/classic/stoppers/simple.cc
index f976142f7c..bd7cd9bcb8 100644
--- a/src/mcts/stoppers/simple.cc
+++ b/src/search/classic/stoppers/simple.cc
@@ -25,9 +25,10 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/stoppers/stoppers.h"
+#include "search/classic/stoppers/stoppers.h"
 
 namespace lczero {
+namespace classic {
 
 namespace {
 
@@ -38,7 +39,8 @@ class SimpleTimeManager : public TimeManager {
         base_pct_(params.GetOrDefault<float>("base-pct", 1.4f)),
         ply_pct_(params.GetOrDefault<float>("ply-pct", 0.049f)),
         time_factor_(params.GetOrDefault<float>("time-factor", 1.5f)),
-        opening_bonus_pct_(params.GetOrDefault<float>("opening-bonus-pct", 0.0f)) {
+        opening_bonus_pct_(
+            params.GetOrDefault<float>("opening-bonus-pct", 0.0f)) {
     if (base_pct_ <= 0.0f || base_pct_ > 100.0f) {
       throw Exception("base-pct value to be in range [0.0, 100.0]");
     }
@@ -53,7 +55,10 @@ class SimpleTimeManager : public TimeManager {
     }
   }
   std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
-                                            const NodeTree& tree) override;
+                                            const Position& position,
+                                            size_t /*total_memory*/,
+                                            size_t /*avg_node_size*/,
+                                            uint32_t /*nodes*/) override;
 
  private:
   const int64_t move_overhead_;
@@ -67,8 +72,8 @@ class SimpleTimeManager : public TimeManager {
 };
 
 std::unique_ptr<SearchStopper> SimpleTimeManager::GetStopper(
-    const GoParams& params, const NodeTree& tree) {
-  const Position& position = tree.HeadPosition();
+    const GoParams& params, const Position& position, size_t /*total_memory*/,
+    size_t /*avg_node_size*/, uint32_t /*nodes*/) {
   const bool is_black = position.IsBlackToMove();
   const std::optional<int64_t>& time = (is_black ? params.btime : params.wtime);
 
@@ -126,4 +131,5 @@ std::unique_ptr<TimeManager> MakeSimpleTimeManager(int64_t move_overhead,
                                                    const OptionsDict& params) {
   return std::make_unique<SimpleTimeManager>(move_overhead, params);
 }
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/simple.h b/src/search/classic/stoppers/simple.h
similarity index 86%
rename from src/mcts/stoppers/simple.h
rename to src/search/classic/stoppers/simple.h
index b7b6f3a349..d7319458b6 100644
--- a/src/mcts/stoppers/simple.h
+++ b/src/search/classic/stoppers/simple.h
@@ -30,8 +30,10 @@
 #include "utils/optionsdict.h"
 
 namespace lczero {
+namespace classic {
 
-std::unique_ptr<TimeManager> MakeSimpleTimeManager(
-    int64_t move_overhead, const OptionsDict& params);
+std::unique_ptr<TimeManager> MakeSimpleTimeManager(int64_t move_overhead,
+                                                   const OptionsDict& params);
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/smooth.cc b/src/search/classic/stoppers/smooth.cc
similarity index 97%
rename from src/mcts/stoppers/smooth.cc
rename to src/search/classic/stoppers/smooth.cc
index 2a1a196247..8a32187089 100644
--- a/src/mcts/stoppers/smooth.cc
+++ b/src/search/classic/stoppers/smooth.cc
@@ -25,24 +25,26 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/stoppers/smooth.h"
+#include "search/classic/stoppers/smooth.h"
 
+#include <cmath>
 #include <functional>
 #include <iomanip>
 #include <optional>
 
-#include "mcts/stoppers/legacy.h"
-#include "mcts/stoppers/stoppers.h"
+#include "search/classic/stoppers/legacy.h"
+#include "search/classic/stoppers/stoppers.h"
 #include "utils/mutex.h"
 
 namespace lczero {
+namespace classic {
 namespace {
 
 class Params {
  public:
   Params(const OptionsDict& /* params */, int64_t move_overhead);
 
-  using MovesLeftEstimator = std::function<float(const NodeTree&)>;
+  using MovesLeftEstimator = std::function<float(const Position&)>;
 
   // Which fraction of the tree is reuse after a full move. Initial guess.
   float initial_tree_reuse() const { return initial_tree_reuse_; }
@@ -132,8 +134,8 @@ Params::MovesLeftEstimator CreateMovesLeftEstimator(const OptionsDict& params) {
                                     : params;
   return [midpoint = mle_dict.GetOrDefault<float>("midpoint", 45.2f),
           steepness = mle_dict.GetOrDefault<float>("steepness", 5.93f)](
-             const NodeTree& tree) {
-    const auto ply = tree.HeadPosition().GetGamePly();
+             const Position& position) {
+    const auto ply = position.GetGamePly();
     return ComputeEstimatedMovesToGo(ply, midpoint, steepness);
   };
 }
@@ -334,8 +336,10 @@ class SmoothTimeManager : public TimeManager {
 
  private:
   std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
-                                            const NodeTree& tree) override {
-    const Position& position = tree.HeadPosition();
+                                            const Position& position,
+                                            size_t /*total_memory*/,
+                                            size_t /*avg_node_size*/,
+                                            uint32_t current_nodes) override {
     const bool is_black = position.IsBlackToMove();
     const std::optional<int64_t>& time =
         (is_black ? params.btime : params.wtime);
@@ -350,7 +354,6 @@ class SmoothTimeManager : public TimeManager {
       is_first_move_ = false;
     }
 
-    const auto current_nodes = tree.GetCurrentHead()->GetN();
     if (last_move_final_nodes_ && last_time_ && avg_ms_per_move_ >= 0.0f) {
       UpdateTreeReuseFactor(current_nodes);
     }
@@ -358,7 +361,7 @@ class SmoothTimeManager : public TimeManager {
     last_time_ = 0;
 
     // Get remaining moves estimation.
-    float remaining_moves = params_.moves_left_estimator()(tree);
+    float remaining_moves = params_.moves_left_estimator()(position);
 
     // If the number of moves remaining until the time control are less than
     // the estimated number of moves left in the game, then use the number of
@@ -636,4 +639,5 @@ std::unique_ptr<TimeManager> MakeSmoothTimeManager(int64_t move_overhead,
   return std::make_unique<SmoothTimeManager>(move_overhead, params);
 }
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/smooth.h b/src/search/classic/stoppers/smooth.h
similarity index 94%
rename from src/mcts/stoppers/smooth.h
rename to src/search/classic/stoppers/smooth.h
index 2e68e83667..da81a9a8b0 100644
--- a/src/mcts/stoppers/smooth.h
+++ b/src/search/classic/stoppers/smooth.h
@@ -27,12 +27,14 @@
 
 #pragma once
 
-#include "mcts/stoppers/timemgr.h"
+#include "search/classic/stoppers/timemgr.h"
 #include "utils/optionsdict.h"
 
 namespace lczero {
+namespace classic {
 
 std::unique_ptr<TimeManager> MakeSmoothTimeManager(int64_t move_overhead,
                                                    const OptionsDict& params);
 
+}  // namespace classic
 }  // namespace lczero
\ No newline at end of file
diff --git a/src/mcts/stoppers/stoppers.cc b/src/search/classic/stoppers/stoppers.cc
similarity index 91%
rename from src/mcts/stoppers/stoppers.cc
rename to src/search/classic/stoppers/stoppers.cc
index 201faa23b8..5cf4ecd092 100644
--- a/src/mcts/stoppers/stoppers.cc
+++ b/src/search/classic/stoppers/stoppers.cc
@@ -25,12 +25,12 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/stoppers/stoppers.h"
+#include "search/classic/stoppers/stoppers.h"
 
-#include "mcts/node.h"
-#include "neural/cache.h"
+#include <cmath>
 
 namespace lczero {
+namespace classic {
 
 ///////////////////////////
 // ChainedSearchStopper
@@ -91,23 +91,21 @@ bool PlayoutsStopper::ShouldStop(const IterationStats& stats,
 // MemoryWatchingStopper
 ///////////////////////////
 
-namespace {
-const size_t kAvgNodeSize =
-    sizeof(Node) + MemoryWatchingStopper::kAvgMovesPerPosition * sizeof(Edge);
-const size_t kAvgCacheItemSize =
-    NNCache::GetItemStructSize() + sizeof(CachedNNRequest) +
-    sizeof(CachedNNRequest::IdxAndProb) *
-        MemoryWatchingStopper::kAvgMovesPerPosition;
-}  // namespace
-
-MemoryWatchingStopper::MemoryWatchingStopper(int cache_size, int ram_limit_mb,
+MemoryWatchingStopper::MemoryWatchingStopper(int ram_limit_mb,
+                                             size_t total_memory,
+                                             size_t avg_node_size,
+                                             uint32_t nodes,
                                              bool populate_remaining_playouts)
     : VisitsStopper(
-          (ram_limit_mb * 1000000LL - cache_size * kAvgCacheItemSize) /
-              kAvgNodeSize,
+          [&]() -> size_t {  // Visit limit that fits the RAM budget.
+            const auto ram_limit = ram_limit_mb * 1000000LL;  // In bytes.
+            const auto nodes_memory = avg_node_size * nodes;  // In bytes.
+            if (ram_limit + nodes_memory < total_memory) return 0;
+            return (ram_limit + nodes_memory - total_memory) / avg_node_size;
+          }(),
           populate_remaining_playouts) {
-  LOGFILE << "RAM limit " << ram_limit_mb << "MB. Cache takes "
-          << cache_size * kAvgCacheItemSize / 1000000
+  LOGFILE << "RAM limit " << ram_limit_mb << "MB. Memory allocated is "
+          << (total_memory - avg_node_size * nodes) / 1000000
           << "MB. Remaining memory is enough for " << GetVisitsLimit()
           << " nodes.";
 }
@@ -267,4 +265,5 @@ bool SmartPruningStopper::ShouldStop(const IterationStats& stats,
   return false;
 }
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/stoppers.h b/src/search/classic/stoppers/stoppers.h
similarity index 95%
rename from src/mcts/stoppers/stoppers.h
rename to src/search/classic/stoppers/stoppers.h
index f813c4daf1..7232d8000e 100644
--- a/src/mcts/stoppers/stoppers.h
+++ b/src/search/classic/stoppers/stoppers.h
@@ -30,10 +30,10 @@
 #include <optional>
 #include <vector>
 
-#include "mcts/node.h"
-#include "mcts/stoppers/timemgr.h"
+#include "search/classic/stoppers/timemgr.h"
 
 namespace lczero {
+namespace classic {
 
 // Combines multiple stoppers into one.
 class ChainedSearchStopper : public SearchStopper {
@@ -54,7 +54,7 @@ class ChainedSearchStopper : public SearchStopper {
 class VisitsStopper : public SearchStopper {
  public:
   VisitsStopper(int64_t limit, bool populate_remaining_playouts)
-    : nodes_limit_(limit ? limit : 4000000000ll),
+      : nodes_limit_(limit),
         populate_remaining_playouts_(populate_remaining_playouts) {}
   int64_t GetVisitsLimit() const { return nodes_limit_; }
   bool ShouldStop(const IterationStats&, StoppersHints*) override;
@@ -84,7 +84,8 @@ class MemoryWatchingStopper : public VisitsStopper {
  public:
   // Must be in sync with description at kRamLimitMbId.
   static constexpr size_t kAvgMovesPerPosition = 30;
-  MemoryWatchingStopper(int cache_size, int ram_limit_mb,
+  MemoryWatchingStopper(int ram_limit_mb, size_t total_memory,
+                        size_t avg_node_size, uint32_t nodes,
                         bool populate_remaining_playouts);
 };
 
@@ -151,4 +152,5 @@ class SmartPruningStopper : public SearchStopper {
   std::optional<int64_t> first_eval_time_ GUARDED_BY(mutex_);
 };
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/mcts/stoppers/timemgr.cc b/src/search/classic/stoppers/timemgr.cc
similarity index 94%
rename from src/mcts/stoppers/timemgr.cc
rename to src/search/classic/stoppers/timemgr.cc
index cd75658c72..41cd096dd5 100644
--- a/src/mcts/stoppers/timemgr.cc
+++ b/src/search/classic/stoppers/timemgr.cc
@@ -25,11 +25,12 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "mcts/stoppers/timemgr.h"
+#include "search/classic/stoppers/timemgr.h"
 
-#include "mcts/stoppers/stoppers.h"
+#include "search/classic/stoppers/stoppers.h"
 
 namespace lczero {
+namespace classic {
 
 StoppersHints::StoppersHints() { Reset(); }
 
@@ -64,4 +65,5 @@ void StoppersHints::Reset() {
   estimated_nps_.reset();
 }
 
+}  // namespace classic
 }  // namespace lczero
\ No newline at end of file
diff --git a/src/mcts/stoppers/timemgr.h b/src/search/classic/stoppers/timemgr.h
similarity index 91%
rename from src/mcts/stoppers/timemgr.h
rename to src/search/classic/stoppers/timemgr.h
index 60a1097863..24fec21573 100644
--- a/src/mcts/stoppers/timemgr.h
+++ b/src/search/classic/stoppers/timemgr.h
@@ -33,10 +33,10 @@
 #include <vector>
 
 #include "chess/uciloop.h"
-#include "mcts/node.h"
 #include "utils/optionsdict.h"
 
 namespace lczero {
+namespace classic {
 
 // Various statistics that search sends to stoppers for their stopping decision.
 // It is expected that this structure will grow.
@@ -105,7 +105,11 @@ class TimeManager {
  public:
   virtual ~TimeManager() = default;
   virtual std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
-                                                    const NodeTree& tree) = 0;
+                                                    const Position& position,
+                                                    size_t total_memory,
+                                                    size_t avg_node_size,
+                                                    uint32_t nodes) = 0;
 };
 
+}  // namespace classic
 }  // namespace lczero
diff --git a/src/search/classic/wrapper.cc b/src/search/classic/wrapper.cc
new file mode 100644
index 0000000000..8ba6f0fd49
--- /dev/null
+++ b/src/search/classic/wrapper.cc
@@ -0,0 +1,164 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "chess/gamestate.h"
+#include "search/classic/search.h"
+#include "search/classic/stoppers/factory.h"
+#include "search/register.h"
+#include "search/search.h"
+#include "neural/shared_params.h"
+#include "utils/trace.h"
+
+namespace lczero {
+namespace classic {
+namespace {
+
+const OptionId kThreadsOptionId{
+    {.long_flag = "threads",
+     .uci_option = "Threads",
+     .help_text =
+         "Number of (CPU) worker threads to use, 0 for the backend default.",
+     .short_flag = 't',
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId kClearTree{
+    {.long_flag = "",
+     .uci_option = "ClearTree",
+     .help_text = "Clear the tree before the next search.",
+     .visibility = OptionId::kProOnly}};
+
+class ClassicSearch : public SearchBase {
+ public:
+  ClassicSearch(UciResponder* responder, const OptionsDict* options)
+      : SearchBase(responder), options_(options) {}  // Only the pointer is kept.
+
+ private:
+  void NewGame() override;  // Drops search and tree, makes a new time manager.
+  void SetPosition(const GameState& pos) override;
+  void StartSearch(const GoParams&) override;
+  void StartClock() override {  // Records the start time of the current move.
+    move_start_time_ = std::chrono::steady_clock::now();
+  }
+  void WaitSearch() override {  // No-op when no search has been created yet.
+    if (search_) search_->Wait();
+  }
+  void StopSearch() override {  // No-op when no search has been created yet.
+    if (search_) search_->Stop();
+  }
+  void AbortSearch() override {  // No-op when no search has been created yet.
+    if (search_) search_->Abort();
+  }
+
+  const OptionsDict* options_;  // Non-owning; dereferenced on every use.
+  std::unique_ptr<TimeManager> time_manager_;  // Set in NewGame/SetPosition.
+  std::unique_ptr<Search> search_;  // Replaced by every StartSearch().
+  std::unique_ptr<NodeTree> tree_;  // Created lazily in SetPosition().
+  std::optional<std::chrono::steady_clock::time_point> move_start_time_;
+};
+
+MoveList StringsToMovelist(const std::vector<std::string>& moves,
+                           const ChessBoard& board) {
+  MoveList result;
+  if (moves.empty()) return result;  // Nothing requested.
+  result.reserve(moves.size());
+  const auto legal_moves = board.GenerateLegalMoves();
+  for (const auto& text : moves) {
+    const Move parsed = board.ParseMove(text);
+    const bool is_legal = std::find(legal_moves.begin(), legal_moves.end(),
+                                    parsed) != legal_moves.end();
+    // Entries that are not legal in this position are dropped.
+    if (is_legal) result.emplace_back(parsed);
+  }
+  // Moves were requested but none survived the legality filter.
+  if (result.empty()) throw Exception("No legal searchmoves.");
+  return result;
+}
+
+void ClassicSearch::NewGame() {
+  LCTRACE_FUNCTION_SCOPE;
+  search_.reset();  // Released before the tree it was constructed with.
+  tree_.reset();
+  time_manager_ = MakeTimeManager(*options_);  // Fresh time manager per game.
+}
+
+void ClassicSearch::SetPosition(const GameState& pos) {
+  LCTRACE_FUNCTION_SCOPE;
+  if (!tree_) tree_ = std::make_unique<NodeTree>();  // First use: create tree.
+  const bool is_same_game = tree_->ResetToPosition(pos);
+  if (!is_same_game) time_manager_ = MakeTimeManager(*options_);  // New game.
+}
+
+void ClassicSearch::StartSearch(const GoParams& params) {
+  LCTRACE_FUNCTION_SCOPE;
+  auto forwarder =
+      std::make_unique<NonOwningUciRespondForwarder>(uci_responder_);
+  if (options_->Get<Button>(kClearTree).TestAndReset()) tree_->TrimTreeAtHead();
+
+  // Rough memory estimate (tree nodes plus NN cache) handed to the stoppers.
+  constexpr size_t kAvgMoves = MemoryWatchingStopper::kAvgMovesPerPosition;
+  constexpr size_t kAvgNodeSize = sizeof(Node) + kAvgMoves * sizeof(Edge);
+  constexpr size_t kAvgCacheItemSize =
+      (3 + kAvgMoves) * sizeof(float) + sizeof(std::unique_ptr<float[]>);
+  const auto cache_size =
+      options_->Get<int>(SharedBackendParams::kNNCacheSizeId);
+  const auto head_visits = tree_->GetCurrentHead()->GetN();
+  const size_t total_memory =
+      head_visits * kAvgNodeSize + cache_size * kAvgCacheItemSize;
+  auto stopper = time_manager_->GetStopper(
+      params, tree_->HeadPosition(), total_memory, kAvgNodeSize, head_visits);
+  search_ = std::make_unique<Search>(
+      *tree_, backend_, std::move(forwarder),
+      StringsToMovelist(params.searchmoves, tree_->HeadPosition().GetBoard()),
+      *move_start_time_, std::move(stopper), params.infinite, params.ponder,
+      *options_, syzygy_tb_);
+
+  LOGFILE << "Timer started at "
+          << FormatTime(SteadyClockToSystemClock(*move_start_time_));
+  search_->StartThreads(options_->Get<int>(kThreadsOptionId));
+}
+
+class ClassicSearchFactory : public SearchFactory {  // Registered just below.
+  std::string_view GetName() const override { return "classic"; }
+  std::unique_ptr<SearchBase> CreateSearch(
+      UciResponder* responder, const OptionsDict* options) const override {
+    LCTRACE_FUNCTION_SCOPE;
+    return std::make_unique<ClassicSearch>(responder, options);
+  }
+
+  void PopulateParams(OptionsParser* parser) const override {
+    parser->Add<IntOption>(kThreadsOptionId, 0, 128) = 0;  // "Threads" option.
+    SearchParams::Populate(parser);
+    PopulateTimeManagementOptions(RunType::kUci, parser);
+
+    parser->Add<ButtonOption>(kClearTree);  // Polled by StartSearch().
+  }
+};
+
+REGISTER_SEARCH(ClassicSearchFactory);
+
+}  // namespace
+}  // namespace classic
+}  // namespace lczero
diff --git a/src/search/dag_classic/node.cc b/src/search/dag_classic/node.cc
new file mode 100644
index 0000000000..1ad4762f0e
--- /dev/null
+++ b/src/search/dag_classic/node.cc
@@ -0,0 +1,962 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "search/dag_classic/node.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <list>
+#include <sstream>
+#include <thread>
+#include <unordered_set>
+
+#include "utils/exception.h"
+#include "utils/hashcat.h"
+
+namespace lczero {
+namespace dag_classic {
+
+/////////////////////////////////////////////////////////////////////////
+// Edge
+/////////////////////////////////////////////////////////////////////////
+
+Move Edge::GetMove(bool as_opponent) const {
+  // Work on a copy so the stored move is never modified.
+  Move result = move_;
+  if (as_opponent) result.Flip();
+  return result;
+}
+
+// Policy priors (P) are stored in a compressed 16-bit format.
+//
+// Source values are 32-bit floats:
+// * bit 31 is sign (zero means positive)
+// * bit 30 is sign of exponent (zero means nonpositive)
+// * bits 29..23 are value bits of exponent
+// * bits 22..0 are significand bits (plus a "virtual" always-on bit: s ∈ [1,2))
+// The number is then sign * 2^exponent * significand, usually.
+// See https://www.h-schmidt.net/FloatConverter/IEEE754.html for details.
+//
+// In compressed 16-bit value we store bits 27..12:
+// * bit 31 is always off as values are always >= 0
+// * bit 30 is always off as values are always < 2
+// * bits 29..28 are only off for values < 4.6566e-10, assume they are always on
+// * bits 11..0 are for higher precision, they are dropped leaving only 11 bits
+//     of precision
+//
+// When converting to compressed format, bit 11 is added in order to make the
+// conversion round to nearest rather than truncate.
+//
+// Out of 65536 possible values, 2047 are outside of [0,1] interval (they are in
+// interval (1,2)). This is fine because the values in [0,1] are skewed towards
+// 0, which is also exactly how the components of policy tend to behave (since
+// they add up to 1).
+
+// If the two assumed-on exponent bits (3<<28) are in fact off, the input is
+// rounded up to the smallest value with them on. We accomplish this by
+// subtracting the two bits from the input and checking for a negative result
+// (the subtraction works despite crossing from exponent to significand). This
+// is combined with the round-to-nearest addition (1<<11) into one op.
+void Edge::SetP(float p) {
+  assert(0.0f <= p && p <= 1.0f);
+  constexpr int32_t roundings = (1 << 11) - (3 << 28);
+  int32_t tmp;
+  std::memcpy(&tmp, &p, sizeof(float));  // Reinterpret the float's raw bits.
+  tmp += roundings;  // Rounding and exponent-bit removal in a single add.
+  p_ = (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12);  // Negative: store 0.
+}
+
+float Edge::GetP() const {
+  // Undo the 12-bit shift, then put back the two assumed-on exponent bits.
+  constexpr uint32_t kExpBits = 3 << 28;
+  const uint32_t bits = (static_cast<uint32_t>(p_) << 12) | kExpBits;
+  float value;
+  std::memcpy(&value, &bits, sizeof(uint32_t));
+  return value;
+}
+
+std::string Edge::DebugString() const {
+  std::ostringstream out;
+  out << "Move: " << move_.ToString(true);
+  out << " p_: " << p_ << " GetP: " << GetP();
+  return out.str();
+}
+
+std::unique_ptr<Edge[]> Edge::FromMovelist(const MoveList& moves) {
+  auto result = std::make_unique<Edge[]>(moves.size());  // One edge per move.
+  size_t next = 0;
+  for (const auto move : moves) result[next++].move_ = move;
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////////////
+// LowNode + Node
+/////////////////////////////////////////////////////////////////////////
+
+void Node::Trim() {
+  wl_ = 0.0f;
+
+  UnsetLowNode();  // Drops this node's reference to its low node, if any.
+  // sibling_ is not touched.
+
+  d_ = 0.0f;
+  m_ = 0.0f;
+  n_ = 0;
+  n_in_flight_.store(0, std::memory_order_release);
+
+  // edge_ is not touched.
+
+  // index_ is not touched.
+
+  terminal_type_ = Terminal::NonTerminal;
+  lower_bound_ = GameResult::BLACK_WON;
+  upper_bound_ = GameResult::WHITE_WON;
+  repetition_ = false;
+}
+
+LowNode::~LowNode() {
+  NodeGarbageCollector::Instance().AddToGcQueue(child_);
+}
+
+Node::~Node() {
+  NodeGarbageCollector::Instance().AddToGcQueue(sibling_);
+  UnsetLowNode();
+}
+
+Node* Node::GetChild() const {
+  // Children are reached through the low node, if one is attached.
+  return low_node_ ? low_node_->GetChild()->get() : nullptr;
+}
+
+bool Node::HasChildren() const { return low_node_ && low_node_->HasChildren(); }
+
+float Node::GetVisitedPolicy() const {
+  float visited_policy = 0.0f;  // Sum of P over VisitedNodes().
+  for (auto* visited : VisitedNodes()) visited_policy += visited->GetP();
+  return visited_policy;
+}
+
+uint32_t Node::GetNInFlight() const {
+  return n_in_flight_.load(std::memory_order_acquire);
+}
+
+uint32_t Node::GetChildrenVisits() const {
+  return low_node_ ? low_node_->GetChildrenVisits() : 0;
+}
+
+uint32_t Node::GetTotalVisits() const {
+  return low_node_ ? low_node_->GetN() : 0;
+}
+
+const Edge& LowNode::GetEdgeAt(uint16_t index) const { return edges_[index]; }
+
+std::string Node::DebugString() const {
+  std::ostringstream out;
+  out << " <Node> This:" << this << " LowNode:" << low_node_.get();
+  out << " Index:" << index_ << " Move:" << GetMove().ToString(true);
+  out << " Sibling:" << sibling_.get() << " P:" << GetP() << " WL:" << wl_;
+  out << " D:" << d_ << " M:" << m_ << " N:" << n_ << " N_:" << GetNInFlight();
+  out << " Term:" << static_cast<int>(terminal_type_);
+  out << " Bounds:" << static_cast<int>(lower_bound_) - 2 << ","
+      << static_cast<int>(upper_bound_) - 2;
+  return out.str();
+}
+
+std::string LowNode::DebugString() const {
+  std::ostringstream out;
+  out << " <LowNode> This:" << this << " Edges:" << edges_.get();
+  out << " NumEdges:" << static_cast<int>(num_edges_);
+  out << " Child:" << child_.get() << " WL:" << wl_ << " D:" << d_;
+  out << " M:" << m_ << " N:" << n_ << " NP:" << num_parents_;
+  out << " Term:" << static_cast<int>(terminal_type_);
+  out << " Bounds:" << static_cast<int>(lower_bound_) - 2 << ","
+      << static_cast<int>(upper_bound_) - 2;
+  return out.str();
+}
+
+void Edge::SortEdges(Edge* edges, int num_edges) {
+  // Sorts in descending order of policy. Comparing raw p_ gives the same order
+  // as comparing GetP() (a side effect of the encoding) and is faster.
+  std::sort(edges, (edges + num_edges),
+            [](const Edge& a, const Edge& b) { return a.p_ > b.p_; });
+}
+
+void LowNode::MakeTerminal(GameResult result, float plies_left, Terminal type) {
+  SetBounds(result, result);
+  terminal_type_ = type;
+  m_ = plies_left;
+  // Any result other than the three handled below leaves wl_ and d_ as is.
+  const bool decisive =
+      result == GameResult::WHITE_WON || result == GameResult::BLACK_WON;
+  if (decisive) {
+    wl_ = (result == GameResult::WHITE_WON) ? 1.0f : -1.0f;
+    d_ = 0.0f;
+  } else if (result == GameResult::DRAW) {
+    wl_ = 0.0f;
+    d_ = 1.0f;
+  }
+
+  assert(WLDMInvariantsHold());
+}
+
+void LowNode::MakeNotTerminal(const Node* node) {
+  assert(edges_);
+  if (!IsTerminal()) return;
+
+  terminal_type_ = Terminal::NonTerminal;
+  lower_bound_ = GameResult::BLACK_WON;
+  upper_bound_ = GameResult::WHITE_WON;
+  n_ = 0;
+  wl_ = 0.0;
+  d_ = 0.0;
+  m_ = 0.0;
+
+  // Rebuild the statistics as the visit-weighted mean of the visited children.
+  if (node->GetNumEdges() > 0) {
+    for (const auto& child : node->Edges()) {
+      const auto n = child.GetN();
+      if (n > 0) {
+        n_ += n;
+        // Child values are accumulated as-is (no sign flip is applied here).
+        // Default values don't matter as n is > 0.
+        wl_ += child.GetWL(0.0f) * n;
+        d_ += child.GetD(0.0f) * n;
+        m_ += child.GetM(0.0f) * n;
+      }
+    }
+    // Average over visits; divide by 1 when nothing was visited (avoids NaN).
+    const auto divisor = n_ > 0 ? n_ : 1;
+    wl_ /= divisor;
+    d_ /= divisor;
+    m_ /= divisor;
+  }
+
+  assert(WLDMInvariantsHold());
+}
+
+void LowNode::SetBounds(GameResult lower, GameResult upper) {
+  lower_bound_ = lower;
+  upper_bound_ = upper;
+}
+
+uint8_t Node::GetNumEdges() const {
+  return low_node_ ? low_node_->GetNumEdges() : 0;
+}
+
+void Node::MakeTerminal(GameResult result, float plies_left, Terminal type) {
+  SetBounds(result, result);  // Lower and upper bound are both set to result.
+  terminal_type_ = type;
+  m_ = plies_left;
+  if (result == GameResult::DRAW) {
+    wl_ = 0.0f;
+    d_ = 1.0f;
+  } else if (result == GameResult::WHITE_WON) {
+    wl_ = 1.0f;
+    d_ = 0.0f;
+  } else if (result == GameResult::BLACK_WON) {
+    wl_ = -1.0f;
+    d_ = 0.0f;
+    // Terminal losses have no uncertainty and no reason for their U value to be
+    // comparable to another non-loss choice. Force this by clearing the policy.
+    SetP(0.0f);
+  }  // Any other result leaves wl_ and d_ unchanged.
+
+  assert(WLDMInvariantsHold());
+}
+
+void Node::MakeNotTerminal(bool also_low_node) {
+  // At least one of node and low node pair needs to be a terminal.
+  if (!IsTerminal() &&
+      (!also_low_node || !low_node_ || !low_node_->IsTerminal()))
+    return;
+
+  terminal_type_ = Terminal::NonTerminal;
+  repetition_ = false;
+  if (low_node_) {  // Two-fold or derived terminal.
+    // Revert low node first, so the values copied below are up to date.
+    if (also_low_node) low_node_->MakeNotTerminal(this);
+
+    auto [lower_bound, upper_bound] = low_node_->GetBounds();
+    lower_bound_ = -upper_bound;  // Bounds are negated and swapped.
+    upper_bound_ = -lower_bound;
+    n_ = low_node_->GetN();
+    wl_ = -low_node_->GetWL();  // WL is negated relative to the low node.
+    d_ = low_node_->GetD();
+    m_ = low_node_->GetM() + 1;  // One more ply than the low node.
+  } else {  // Real terminal.
+    lower_bound_ = GameResult::BLACK_WON;
+    upper_bound_ = GameResult::WHITE_WON;
+    n_ = 0;  // Visit count: integer literal, not 0.0f.
+    wl_ = 0.0f;
+    d_ = 0.0f;
+    m_ = 0.0f;
+  }
+
+  assert(WLDMInvariantsHold());
+}
+
+void Node::SetBounds(GameResult lower, GameResult upper) {
+  lower_bound_ = lower;
+  upper_bound_ = upper;
+}
+
+bool Node::TryStartScoreUpdate() {
+  if (n_ > 0) {
+    n_in_flight_.fetch_add(1, std::memory_order_acq_rel);
+    return true;
+  }
+  // Unvisited node: succeed only if the in-flight count goes from 0 to 1.
+  uint32_t expected = 0;
+  return n_in_flight_.compare_exchange_strong(expected, 1,
+                                              std::memory_order_acq_rel);
+}
+
+void Node::CancelScoreUpdate(uint32_t multivisit) {
+  assert(GetNInFlight() >= (uint32_t)multivisit);
+  n_in_flight_.fetch_sub(multivisit, std::memory_order_acq_rel);
+}
+
+void LowNode::FinalizeScoreUpdate(float v, float d, float m,
+                                  uint32_t multivisit) {
+  assert(edges_);
+  // Fold `multivisit` samples of (v, d, m) into the running averages.
+  const auto updated_n = n_ + multivisit;
+  wl_ += multivisit * (v - wl_) / updated_n;
+  d_ += multivisit * (d - d_) / updated_n;
+  m_ += multivisit * (m - m_) / updated_n;
+
+  assert(WLDMInvariantsHold());
+
+  n_ = updated_n;
+}
+
+void LowNode::AdjustForTerminal(float v, float d, float m,
+                                uint32_t multivisit) {
+  assert(static_cast<uint32_t>(multivisit) <= n_);
+
+  // Add multivisit / n_ of each given value to the averages; n_ is unchanged.
+  wl_ += multivisit * v / n_;
+  d_ += multivisit * d / n_;
+  m_ += multivisit * m / n_;
+
+  assert(WLDMInvariantsHold());
+}
+
+void Node::FinalizeScoreUpdate(float v, float d, float m, uint32_t multivisit) {
+  // Fold `multivisit` samples of (v, d, m) into the running averages.
+  const auto updated_n = n_ + multivisit;
+  wl_ += multivisit * (v - wl_) / updated_n;
+  d_ += multivisit * (d - d_) / updated_n;
+  m_ += multivisit * (m - m_) / updated_n;
+
+  assert(WLDMInvariantsHold());
+
+  n_ = updated_n;
+  // These visits are no longer in flight.
+  assert(GetNInFlight() >= multivisit);
+  n_in_flight_.fetch_sub(multivisit, std::memory_order_acq_rel);
+}
+
+void Node::AdjustForTerminal(float v, float d, float m, uint32_t multivisit) {
+  assert(static_cast<uint32_t>(multivisit) <= n_);
+
+  // Add multivisit / n_ of each given value to the averages; n_ is unchanged.
+  wl_ += multivisit * v / n_;
+  d_ += multivisit * d / n_;
+  m_ += multivisit * m / n_;
+
+  assert(WLDMInvariantsHold());
+}
+
+void Node::IncrementNInFlight(uint32_t multivisit) {
+  n_in_flight_.fetch_add(multivisit, std::memory_order_acq_rel);
+}
+
+void LowNode::ReleaseChildren() {
+  NodeGarbageCollector::Instance().AddToGcQueue(child_);
+}
+
+void LowNode::ReleaseChildrenExceptOne(Node* node_to_save) {
+  auto& ngc = NodeGarbageCollector::Instance();
+  // Holds the node that has to survive (stays null if it is never found).
+  std::unique_ptr<Node> saved_node;
+  // Iterate over the owning slots themselves so the match can be released.
+  for (auto node = &child_; *node; node = (*node)->GetSibling()) {
+    // If current node is the one that we have to save.
+    if (node->get() == node_to_save) {
+      // Queue everything after the saved node for garbage collection.
+      ngc.AddToGcQueue(*(*node)->GetSibling());
+      // Detach the saved node from its slot and take over its ownership.
+      saved_node.reset(node->release());
+      break;
+    }
+  }
+  // Queue what is still reachable from child_, then install the saved node
+  ngc.AddToGcQueue(child_);  // (or null when nothing was saved)
+  child_ = std::move(saved_node);  // as the only child.
+}
+
+void Node::ReleaseChildrenExceptOne(Node* node_to_save) const {
+  // Nothing to release when no low node is attached (e.g. there is no graph
+  // yet, or this is a reverted terminal).
+  if (!low_node_) return;
+  low_node_->ReleaseChildrenExceptOne(node_to_save);
+}
+
+void Node::SetLowNode(std::shared_ptr<LowNode> low_node) {
+  assert(!low_node_);  // Must not already reference a low node.
+  low_node->AddParent();  // Undone by RemoveParent() in UnsetLowNode().
+  low_node_ = std::move(low_node);  // Move the by-value argument, don't copy.
+}
+void Node::UnsetLowNode() {
+  if (low_node_) low_node_->RemoveParent();
+  low_node_.reset();
+}
+
+#ifndef NDEBUG
+namespace {
+static Node::VisitorId::storage current_visitor_id = 0;
+}
+
+Node::VisitorId::VisitorId() {
+  id_ = ++current_visitor_id;
+  if (id_ == 0)
+    id_ = ++current_visitor_id;
+}
+
+Node::VisitorId::~VisitorId() {
+  assert(current_visitor_id == id_);
+}
+
+bool LowNode::Visit(Node::VisitorId::type id) {
+  // True only when the stored id differs from the given one; the given id is
+  // stored either way.
+  const bool first_visit = !(visitor_id_ == id);
+  visitor_id_ = id;
+  return first_visit;
+}
+
+template<typename VisitorType, typename EdgeVisitorType>
+static void TreeWalk(const Node* node, bool as_opponent,
+                     Node::VisitorId::type id,
+                     VisitorType visitor, EdgeVisitorType edge) {
+  const std::shared_ptr<LowNode>& low_node = node->GetLowNode();
+  if (!low_node || !low_node->Visit(id)) {  // No low node, or already walked.
+    return;
+  }
+  // Report this low node; Visit() above makes this happen once per id.
+  visitor(low_node.get(), as_opponent);
+  // First report every child node of this low node as an edge...
+  for (auto& child_edge : node->Edges()) {
+    auto child = child_edge.node();
+    if (child == nullptr) {  // Stop at the first edge without a node.
+      break;
+    }
+    edge(child, as_opponent, low_node.get());
+  }
+  // ...then recurse into the same children with as_opponent inverted.
+  for (auto& child_edge : node->Edges()) {
+    auto child = child_edge.node();
+    if (child == nullptr) {  // Stop at the first edge without a node.
+      return;
+    }
+    TreeWalk(child, !as_opponent, id, visitor, edge);
+  }
+}
+
+static std::string PtrToNodeName(const void* ptr) {
+  std::ostringstream oss;
+  oss << "n_" << ptr;
+  return oss.str();
+}
+
+template<typename VisitorType, typename EdgeVisitorType>
+static void TreeWalk(const Node* node, bool as_opponent,
+                     VisitorType visitor, EdgeVisitorType edge) {
+  Node::VisitorId id{};
+  edge(node, as_opponent, nullptr);
+  TreeWalk(node, !as_opponent, id, visitor, edge);
+}
+
+void LowNode::DotNodeString(std::ofstream& oss) const {
+  oss << PtrToNodeName(this) << " ["
+      << "shape=box";
+  // Adjust formatting to limit node size.
+  oss << std::fixed << std::setprecision(3);
+  oss << ",label=\""     //
+      << std::showpos    //
+      << "WL=" << wl_    //
+      << std::noshowpos  //
+      << "\\lD=" << d_ << "\\lM=" << m_ << "\\lN=" << n_ << "\\l\"";
+  // Set precision for tooltip.
+  oss << std::fixed << std::showpos << std::setprecision(5);
+  oss << ",tooltip=\""   //
+      << std::showpos    //
+      << "WL=" << wl_    //
+      << std::noshowpos  //
+      << "\\nD=" << d_ << "\\nM=" << m_ << "\\nN=" << n_
+      << "\\nNP=" << num_parents_
+      << "\\nTerm=" << static_cast<int>(terminal_type_)  //
+      << std::showpos                                    //
+      << "\\nBounds=" << static_cast<int>(lower_bound_) - 2 << ","
+      << static_cast<int>(upper_bound_) - 2
+      << std::noshowpos                             //
+      << "\\n\\nThis=" << this << "\\nEdges=" << edges_.get()
+      << "\\nNumEdges=" << static_cast<int>(num_edges_)
+      << "\\nChild=" << child_.get() << "\\n\"";
+  oss << "];" << std::endl;
+}
+
+void Node::DotEdgeString(std::ofstream& oss, bool as_opponent, const LowNode* parent) const {
+  oss << (parent == nullptr ? "top" : PtrToNodeName(parent)) << " -> "
+      << (low_node_ ? PtrToNodeName(low_node_.get()) : PtrToNodeName(this))
+      << " [";
+  oss << "label=\""
+      << (parent == nullptr ? "N/A" : GetMove(as_opponent).ToString(true))
+      << "\\lN=" << n_ << "\\lN_=" << GetNInFlight();
+  oss << "\\l\"";
+  // Set precision for tooltip.
+  oss << std::fixed << std::setprecision(5);
+  oss << ",labeltooltip=\""
+      << "P=" << (parent == nullptr ? 0.0f : GetP())  //
+      << std::showpos                                 //
+      << "\\nWL= " << wl_                             //
+      << std::noshowpos                               //
+      << "\\nD=" << d_ << "\\nM=" << m_ << "\\nN=" << n_
+      << "\\nN_=" << GetNInFlight()
+      << "\\nTerm=" << static_cast<int>(terminal_type_)  //
+      << std::showpos                                    //
+      << "\\nBounds=" << static_cast<int>(lower_bound_) - 2 << ","
+      << static_cast<int>(upper_bound_) - 2 << "\\n\\nThis=" << this  //
+      << std::noshowpos                                               //
+      << "\\nLowNode=" << low_node_.get() << "\\nParent=" << parent
+      << "\\nIndex=" << index_ << "\\nSibling=" << sibling_.get() << "\\n\"";
+  oss << "];" << std::endl;
+}
+
+void Node::DotGraphString(std::ofstream& oss, bool as_opponent) const {
+  oss << "strict digraph {" << std::endl;
+  oss << "edge ["
+      << "headport=n"
+      << ",tooltip=\" \""  // Remove default tooltips from edge parts.
+      << "];" << std::endl;
+  oss << "node ["
+      << "shape=point"    // For fake nodes.
+      << ",style=filled"  // Show tooltip everywhere on the node.
+      << ",fillcolor=ivory"
+      << "];" << std::endl;
+  oss << "ranksep=" << 4.0f * std::log10(GetN()) << std::endl;
+
+  TreeWalk(this, !as_opponent,
+    [&](const LowNode* low_node, bool) {
+      low_node->DotNodeString(oss);
+    },
+    [&](const Node* node, bool as_opponent, const LowNode* parent) {
+      node->DotEdgeString(oss, as_opponent, parent);
+    });
+
+  oss << "}" << std::endl;
+}
+
+bool Node::ZeroNInFlight() const {
+  size_t nonzero_node_count = 0;  // Nodes found with n_in_flight_ > 0.
+  TreeWalk(this, false,
+    [](const LowNode*, bool) {},  // Low nodes are not inspected.
+    [&](const Node* node, bool, const LowNode*) {
+      if (node->GetNInFlight() > 0) [[unlikely]] {
+        CERR << node->DebugString() << std::endl;  // Print each offender.
+        ++nonzero_node_count;
+      }
+    });
+
+  if (nonzero_node_count > 0) {
+    CERR << "GetNInFlight() is nonzero on " << nonzero_node_count
+              << " nodes" << std::endl;
+    return false;
+  }
+
+  return true;  // Every walked node had zero visits in flight.
+}
+#endif
+
+void Node::SortEdges() const {
+  assert(low_node_);
+  low_node_->SortEdges();
+}
+
+static constexpr float wld_tolerance = 0.000001f;
+static constexpr float m_tolerance = 0.000001f;
+// Range check on (wl, d, m) with small tolerances; a NaN input yields false.
+static bool WLDMInvariantsHold(float wl, float d, float m) {
+  return -(1.0f + wld_tolerance) < wl && wl < (1.0f + wld_tolerance) &&  //
+         -(0.0f + wld_tolerance) < d && d < (1.0f + wld_tolerance) &&    //
+         -(0.0f + m_tolerance) < m &&                                    //
+         std::abs(wl) + std::abs(d) < (1.0f + wld_tolerance);
+}
+
+bool Node::WLDMInvariantsHold() const {
+  const bool holds = dag_classic::WLDMInvariantsHold(GetWL(), GetD(), GetM());
+  // Dump the offending node so a failed check can be diagnosed.
+  if (!holds) std::cerr << DebugString() << std::endl;
+
+  return holds;
+}
+
+bool LowNode::WLDMInvariantsHold() const {
+  const bool holds = dag_classic::WLDMInvariantsHold(GetWL(), GetD(), GetM());
+  // Dump the offending low node so a failed check can be diagnosed.
+  if (!holds) std::cerr << DebugString() << std::endl;
+
+  return holds;
+}
+
+/////////////////////////////////////////////////////////////////////////
+// EdgeAndNode
+/////////////////////////////////////////////////////////////////////////
+
+std::string EdgeAndNode::DebugString() const {
+  if (!edge_) return "(no edge)";
+  const std::string node_text = node_ ? node_->DebugString() : "(no node)";
+  return edge_->DebugString() + " " + node_text;
+}
+
+/////////////////////////////////////////////////////////////////////////
+// NodeTree
+/////////////////////////////////////////////////////////////////////////
+
+NodeTree::~NodeTree() {
+  auto& ngc = NodeGarbageCollector::Instance();
+  ngc.AddToGcQueue(gamebegin_node_);  // Queue the whole tree from its root.
+  ngc.NotifyThreadGoingSleep();
+  // Start garbage collection now because we delete everything.
+  ngc.Start();
+}
+
+void NodeTree::MakeMove(Move move) {
+  Node* new_head = nullptr;  // Stays null if no edge of the head matches.
+  for (auto& n : current_head_->Edges()) {
+    if (n.GetMove() == move) {
+      new_head = n.GetOrSpawnNode(current_head_);
+      // Ensure head is not terminal, so search can extend or visit children of
+      // "terminal" positions, e.g., WDL hits, converted terminals, 3-fold draw.
+      if (new_head->IsTerminal()) new_head->MakeNotTerminal();
+      break;
+    }
+  }
+  // Release nodes from last move if any.
+  current_head_->ReleaseChildrenExceptOne(new_head);
+  new_head = current_head_->GetChild();  // Re-read what actually survived.
+  current_head_ =
+      new_head ? new_head : current_head_->CreateSingleChildNode(move);
+  history_.Append(move);
+  moves_.push_back(move);
+}
+
+void NodeTree::TrimTreeAtHead() {
+  current_head_->Trim();  // Resets the head node (see Node::Trim).
+  // Flush the thread local destruction queue.
+  NodeGarbageCollector::Instance().NotifyThreadGoingSleep();
+}
+
+bool NodeTree::ResetToPosition(const GameState& pos) {
+  if (gamebegin_node_ && (history_.Starting() != pos.startpos)) {
+    // Completely different position.
+    DeallocateTree();
+  }
+
+  if (!gamebegin_node_) {
+    gamebegin_node_ = std::make_unique<Node>(0);
+  }
+
+  history_.Reset(pos.startpos);
+  moves_.clear();
+
+  Node* old_head = current_head_;  // nullptr right after DeallocateTree().
+  current_head_ = gamebegin_node_.get();
+  bool seen_old_head = (gamebegin_node_.get() == old_head);
+  for (const Move m : pos.moves) {  // Replay the game from the root.
+    MakeMove(m);
+    if (old_head == current_head_) seen_old_head = true;
+  }
+
+  // MakeMove guarantees that no siblings exist; but, if we didn't see the old
+  // head, it means we might have a position that was an ancestor to a
+  // previously searched position, which means that the current_head_ might
+  // retain old n_ and q_ (etc) data, even though its old children were
+  // previously trimmed; we need to reset current_head_ in that case.
+  if (!seen_old_head) TrimTreeAtHead();
+  NodeGarbageCollector::Instance().NotifyThreadGoingSleep();
+  return seen_old_head;  // True iff the previous head is on the new line.
+}
+
+bool NodeTree::ResetToPosition(const std::string& starting_fen,
+                               const std::vector<std::string>& moves) {
+  GameState parsed;
+  parsed.startpos = Position::FromFen(starting_fen);
+  parsed.moves.reserve(moves.size());
+  ChessBoard board = parsed.startpos.GetBoard();
+  for (const std::string& text : moves) {  // Board is mirrored after each ply.
+    Move parsed_move = board.ParseMove(text);
+    parsed.moves.push_back(parsed_move);
+    board.ApplyMove(parsed_move);
+    board.Mirror();
+  }
+  return ResetToPosition(parsed);
+}
+
+void NodeTree::DeallocateTree() {  // Hands the root to the GC, clears head.
+  NodeGarbageCollector::Instance().AddToGcQueue(gamebegin_node_);
+  current_head_ = nullptr;
+}
+
+NodeGarbageCollector::NodeGarbageCollector()
+    // gc_thread_ is constructed with a callable that runs GCThread().
+    : gc_thread_{[this]() { GCThread(); }} {}
+
+template <typename UniquePtr>
+void NodeGarbageCollector::AddToGcQueue(UniquePtr& shared_node) {
+  // Take ownership first: a node that is not queued is freed right here when
+  // `owned` goes out of scope.
+  std::unique_ptr<Node> owned(shared_node.release());
+  if (ShouldQueue(owned)) LocalWork().emplace_back(std::move(owned));
+}
+
+NodeGarbageCollector::~NodeGarbageCollector() {
+  state_.store(Exit, std::memory_order_release);  // Ask GCThread() to return.
+#ifndef NO_STD_ATOMIC_WAIT
+  state_.notify_all();
+#else
+  {
+    Mutex::Lock lock(state_mutex_);
+    state_signal_.notify_all();
+  }
+#endif
+  gc_thread_.join();  // Block until the worker has left its loop.
+}
+
+bool NodeGarbageCollector::SetState(State& old, State desired) {
+  bool rv =  state_.compare_exchange_strong(old, desired,
+                                            std::memory_order_acq_rel);
+  if (rv) {  // Only a successful transition wakes the waiters.
+#ifndef NO_STD_ATOMIC_WAIT
+    state_.notify_all();
+#else
+    Mutex::Lock lock(state_mutex_);
+    state_signal_.notify_all();
+#endif
+  }
+  return rv;  // On failure @old now holds the state that was observed.
+}
+
+void NodeGarbageCollector::Start() {  // Moves any non-Exit state to Running.
+  State s = state_.load(std::memory_order_acquire);
+  do {
+    if (s == Running)
+      break;
+    assert(s != Exit);
+  } while (!SetState(s, Running));  // A failed CAS refreshes s; retry.
+}
+
+void NodeGarbageCollector::Stop() {
+  State old = Running;
+  SetState(old, GoToSleep);  // Has no effect unless the state is Running.
+}
+
+void NodeGarbageCollector::Abort() {
+  Stop();  // Currently the same as Stop().
+}
+
+NodeGarbageCollector::State NodeGarbageCollector::Wait() const {
+  State s;  // Blocks until the collector reports Sleeping.
+  while ((s = state_.load(std::memory_order_acquire)) != Sleeping) {
+    assert(s != Exit);
+#ifndef NO_STD_ATOMIC_WAIT
+    state_.wait(s, std::memory_order_acquire);
+#else
+    Mutex::Lock lock(state_mutex_);
+    state_signal_.wait(lock.get_raw(), [this, s]() {return s != state_;});
+#endif
+  }
+  return s;  // Always Sleeping here.
+}
+
+void NodeGarbageCollector::NotifyThreadGoingSleep() {
+  // Swap the batch into a temporary; its destructor submits the nodes.
+  if (!LocalWork().empty()) {
+    ReleaseNodesWork flushed;
+    LocalWork().swap(flushed);
+  }
+}
+
+bool NodeGarbageCollector::IsActive() const {  // True only in state Running.
+  return state_.load(std::memory_order_acquire) == Running;
+}
+
+bool NodeGarbageCollector::ShouldQueue(std::unique_ptr<Node>& node) const {
+  // Null pointers are never queued.
+  if (node == nullptr) return false;
+  const State current = state_.load(std::memory_order_acquire);
+  switch (current) {
+    case Exit:
+      // Thread local queues have already been destroyed.
+      return false;
+    case Running:
+      // The GC thread frees nodes directly while running; every other thread
+      // pushes them to its thread local batch.
+      return !LocalWork().IsWorker();
+    default:
+      return true;
+  }
+}
+
+void NodeGarbageCollector::GCThread() {
+  auto& shared_work = LocalWork(true);
+  assert(shared_work.IsWorker());
+  State s;
+  while ((s = state_.load(std::memory_order_acquire)) != Exit) {
+    if (s == GoToSleep) {
+      // Signal other threads that we have stopped destruction work.
+      if (SetState(s, Sleeping)) {
+        s = Sleeping;
+      } else {
+        continue;
+      }
+    }
+    if (s == Sleeping) {
+#ifndef NO_STD_ATOMIC_WAIT
+      state_.wait(Sleeping, std::memory_order_acquire);
+#else
+      Mutex::Lock lock(state_mutex_);
+      state_signal_.wait(lock.get_raw(), [this]() {return Sleeping != state_;});
+#endif
+      if (!shared_work.empty()) {
+        // Check for early exit from previous free. The work can be freed
+        // before the batch is full.
+        ReleaseNodesWork new_work(true);
+        new_work.swap(shared_work);
+      }
+      continue;
+    }
+
+    assert(s == Running);
+
+    bool empty = true;
+    std::vector<std::unique_ptr<Node>> nodes;
+    {
+      SpinMutex::Lock lock(mutex_);
+      if (!released_nodes_.empty()) {
+        empty = false;
+        nodes = std::move(released_nodes_.front());
+        released_nodes_.pop_front();
+      }
+    }
+
+    if (!empty) {
+      LOGFILE << "Garbage collection starting.";
+    }
+
+    // Free nodes one by one. LowNode destructor calls AddToGcQueue which lets
+    // recursive destruction terminate before freeing a whole branch.
+    while (!nodes.empty()) {
+      if (!IsActive()) {
+        break;
+      }
+      nodes.pop_back();  // Destroys the unique_ptr, freeing that node.
+    }
+
+    if (!empty) {
+      LOGFILE << "Garbage collection ending.";
+    }
+
+    // Go to sleep if empty or search stopped.
+    if (empty || !IsActive()) {
+      // Lock is required to avoid race between other thread queueing work and
+      // calling Start().
+      SpinMutex::Lock lock(mutex_);
+      // There wasn't enough time to free all nodes. They must go back to the
+      // list.
+      if (!nodes.empty()) {
+        released_nodes_.emplace_front(std::move(nodes));
+      }
+
+      // Going to sleep if the queue is empty.
+      if (released_nodes_.empty()) {
+        State old = Running;
+        SetState(old, Sleeping);
+      }
+    }
+  }
+}
+ReleaseNodesWork::ReleaseNodesWork(bool gc_thread) :
+    is_gc_thread_(gc_thread) {
+  released_nodes_.reserve(kCapacity);  // emplace_back() flushes at kCapacity.
+}
+
+bool ReleaseNodesWork::IsWorker() const {  // True for the GC thread's batch.
+  return is_gc_thread_;
+}
+
+void ReleaseNodesWork::emplace_back(std::unique_ptr<Node>&& node) {
+  if (node == nullptr) return;
+  released_nodes_.emplace_back(std::move(node));
+  // A full batch is swapped into a temporary, whose destructor submits it.
+  if (released_nodes_.size() != kCapacity) return;
+  ReleaseNodesWork full_batch(is_gc_thread_);
+  swap(full_batch);
+}
+
+bool ReleaseNodesWork::empty() const {  // No nodes pending in this batch.
+  return released_nodes_.empty();
+}
+
+void ReleaseNodesWork::swap(ReleaseNodesWork &other) {
+  assert(IsWorker() == other.IsWorker());  // Batches must be the same kind.
+  std::swap(released_nodes_, other.released_nodes_);
+}
+
+ReleaseNodesWork::~ReleaseNodesWork() {
+  Submit();  // Pending nodes are handed to the collector, not dropped here.
+}
+
+void ReleaseNodesWork::Submit() {
+  if (released_nodes_.empty()) {
+    return;
+  }
+  auto& worker = NodeGarbageCollector::Instance();
+  SpinMutex::Lock lock(worker.mutex_);  // Held while touching the queue.
+  // If this is the worker, we have the oldest nodes. Keep them at the front.
+  if (IsWorker()) {
+    worker.released_nodes_.emplace_front(std::move(released_nodes_));
+  } else {
+    worker.released_nodes_.emplace_back(std::move(released_nodes_));
+  }
+}
+
+}  // namespace dag_classic
+}  // namespace lczero
diff --git a/src/search/dag_classic/node.h b/src/search/dag_classic/node.h
new file mode 100644
index 0000000000..b74a64a9d4
--- /dev/null
+++ b/src/search/dag_classic/node.h
@@ -0,0 +1,1082 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <mutex>
+
+#if __cpp_lib_atomic_wait < 201907L
+#define NO_STD_ATOMIC_WAIT 1
+#include <condition_variable>
+#endif
+
+#include "chess/board.h"
+#include "chess/callbacks.h"
+#include "chess/gamestate.h"
+#include "chess/position.h"
+#include "neural/backend.h"
+#include "utils/mutex.h"
+
+namespace lczero {
+namespace dag_classic {
+
+// Terminology:
+// * Edge - a potential edge with a move and policy information.
+// * Node - an existing edge with number of visits and evaluation.
+// * LowNode - a node with number of visits, evaluation and edges.
+//
+// Storage:
+// * Potential edges are stored in a simple array inside the LowNode as edges_.
+// * Existing edges are stored in a linked list starting with a child_ pointer
+//   in the LowNode and continuing with a sibling_ pointer in each Node.
+// * Existing edges have a copy of their potential edge counterpart, index_
+//   among potential edges and are linked to the target LowNode via the
+//   low_node_ pointer.
+//
+// Example:
+//                                 LowNode
+//                                    |
+//        +-------------+-------------+----------------+--------------+
+//        |              |            |                |              |
+//   Edge 0(Nf3)    Edge 1(Bc5)     Edge 2(a4)     Edge 3(Qxf7)    Edge 4(a3)
+//    (dangling)         |           (dangling)        |           (dangling)
+//                   Node, Q=0.5                    Node, Q=-0.2
+//
+//  Is represented as:
+// +-----------------+
+// | LowNode         |
+// +-----------------+                                        +--------+
+// | edges_          | -------------------------------------> | Edge[] |
+// |                 |    +------------+                      +--------+
+// | child_          | -> | Node       |                      | Nf3    |
+// |                 |    +------------+                      | Bc5    |
+// | ...             |    | edge_      |                      | a4     |
+// |                 |    | index_ = 1 |                      | Qxf7   |
+// |                 |    | q_ = 0.5   |    +------------+    | a3     |
+// |                 |    | sibling_   | -> | Node       |    +--------+
+// |                 |    +------------+    +------------+
+// |                 |                      | edge_      |
+// +-----------------+                      | index_ = 3 |
+//                                          | q_ = -0.2  |
+//                                          | sibling_   | -> nullptr
+//                                          +------------+
+
+// Define __i386__ or __arm__ also for 32 bit Windows.
+#if defined(_M_IX86)
+#define __i386__
+#endif
+#if defined(_M_ARM) && !defined(_M_AMD64)
+#define __arm__
+#endif
+
+// Atomic unique_ptr based on the public domain code from
+// https://stackoverflow.com/a/42811152 .
+template <class T>
+class atomic_unique_ptr {
+  using pointer = T*;
+  using unique_pointer = std::unique_ptr<T>;
+
+ public:
+  // Manage no pointer.
+  constexpr atomic_unique_ptr() noexcept : ptr() {}
+
+  // Make pointer @p managed.
+  explicit atomic_unique_ptr(pointer p) noexcept : ptr(p) {}
+
+  // Move the managed pointer ownership from another atomic_unique_ptr.
+  atomic_unique_ptr(atomic_unique_ptr&& p) noexcept : ptr(p.release()) {}
+  // Move-assign from another atomic_unique_ptr; deletes the old pointer.
+  atomic_unique_ptr& operator=(atomic_unique_ptr&& p) noexcept {
+    reset(p.release());
+    return *this;
+  }
+
+  // Move the managed object ownership from a unique_ptr.
+  atomic_unique_ptr(unique_pointer&& p) noexcept : ptr(p.release()) {}
+  // Move-assign from a unique_ptr; deletes the old pointer.
+  atomic_unique_ptr& operator=(unique_pointer&& p) noexcept {
+    reset(p.release());
+    return *this;
+  }
+
+  // Replace the managed pointer, deleting the old one.
+  void reset(pointer p = pointer()) noexcept {
+    auto old = ptr.exchange(p, std::memory_order_acq_rel);
+    if (old) delete old;
+  }
+  // Release ownership of and delete the owned pointer.
+  ~atomic_unique_ptr() { reset(); }
+
+  // Implicit conversion to the managed raw pointer.
+  operator pointer() const noexcept { return get(); }
+  // Member access through the managed pointer.
+  pointer operator->() const noexcept { return get(); }
+  // Returns the managed pointer.
+  pointer get() const noexcept {
+    return ptr.load(std::memory_order_acquire);
+  }
+
+  // Checks whether there is a managed pointer.
+  explicit operator bool() const noexcept { return get() != pointer(); }
+
+  // Replace the managed pointer, returning the old one without deleting it.
+  pointer set(pointer p = pointer()) noexcept {
+    return ptr.exchange(p, std::memory_order_acq_rel);
+  }
+  // Return the managed pointer and release its ownership.
+  pointer release() noexcept { return set(pointer()); }
+
+  // Move managed pointer from @source, iff the managed pointer equals
+  // @expected, which is updated on failure. The old pointer is not deleted.
+  bool compare_exchange(pointer& expected,
+                        atomic_unique_ptr<T>& source) noexcept {
+    if (ptr.compare_exchange_strong(expected, source.get(),
+                                    std::memory_order_acq_rel)) {
+      source.release();
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+ private:
+  std::atomic<pointer> ptr;  // The owned raw pointer.
+};
+
+class Node;
+class Edge {
+ public:
+  // Creates an array of edges from the list of moves.
+  static std::unique_ptr<Edge[]> FromMovelist(const MoveList& moves);
+
+  // Returns move from the point of view of the player making it (if as_opponent
+  // is false) or as opponent (if as_opponent is true).
+  Move GetMove(bool as_opponent = false) const;
+
+  // Returns or sets value of Move policy prior returned from the neural net
+  // (but can be changed by adding Dirichlet noise). Must be in [0,1].
+  float GetP() const;
+  void SetP(float val);
+
+  // Debug information about the edge.
+  std::string DebugString() const;
+
+  static void SortEdges(Edge* edges, int num_edges);  // Reorders in place.
+
+ private:
+  // Move corresponding to this edge. From the point of view of a player,
+  // i.e. black's e7e5 is stored as e2e4.
+  // Root node contains move a1a1.
+  Move move_;
+
+  // Probability that this move will be made, from the policy head of the neural
+  // network; compressed to a 16 bit format (5 bits exp, 11 bits significand).
+  uint16_t p_ = 0;
+  friend class Node;
+};
+
+struct Eval {
+  float wl;  // "W minus L" value.
+  float d;   // Draw probability.
+  float ml;  // presumably estimated moves left — TODO confirm.
+};
+
+struct NNEval {
+  // To minimize the number of padding bytes and to avoid having unnecessary
+  // padding when new fields are added, we arrange the fields by size, largest
+  // to smallest.
+
+  // 8 byte fields on 64-bit platforms, 4 byte on 32-bit.
+  // Array of edges (@num_edges entries).
+  std::unique_ptr<Edge[]> edges;
+
+  // 4 byte fields.
+  float q = 0.0f;
+  float d = 0.0f;
+  float m = 0.0f;
+
+  // 1 byte fields.
+  // Number of edges in @edges.
+  uint8_t num_edges = 0;
+};
+
+typedef std::pair<GameResult, GameResult> Bounds;  // {lower, upper} bound.
+
+enum class Terminal : uint8_t { NonTerminal, EndOfGame, Tablebase };
+
+class EdgeAndNode;
+template <bool is_const>
+class Edge_Iterator;
+
+template <bool is_const>
+class VisitedNode_Iterator;
+
+class NodeGarbageCollector;
+class ReleaseNodesWork;
+
+class LowNode;
+class Node {
+ public:
+  using Iterator = Edge_Iterator<false>;
+  using ConstIterator = Edge_Iterator<true>;
+
+  // Takes own @index in the parent.
+  Node(uint16_t index)
+      : index_(index),
+        terminal_type_(Terminal::NonTerminal),
+        lower_bound_(GameResult::BLACK_WON),
+        upper_bound_(GameResult::WHITE_WON),
+        repetition_(false) {}
+  // Takes own @edge and @index in the parent.
+  Node(const Edge& edge, uint16_t index)
+      : edge_(edge),
+        index_(index),
+        terminal_type_(Terminal::NonTerminal),
+        lower_bound_(GameResult::BLACK_WON),
+        upper_bound_(GameResult::WHITE_WON),
+        repetition_(false) {}
+  ~Node();
+
+  // Trim node, resetting everything except parent, sibling, edge and index.
+  void Trim();
+
+  // Allocates a new edge and a new node. The node has to be without edges
+  // before that.
+  Node* CreateSingleChildNode(Move move) {
+    assert(!low_node_);
+    auto low_node = std::make_shared<LowNode>(MoveList({move}), 0);
+    SetLowNode(low_node);
+    return GetChild();
+  }
+
+  // Get first child.
+  Node* GetChild() const;
+  // Get the slot holding the next sibling.
+  atomic_unique_ptr<Node>* GetSibling() { return &sibling_; }
+  // Moves sibling in.
+  void MoveSiblingIn(std::unique_ptr<Node>& sibling) {
+    sibling_ = std::move(sibling);
+  }
+
+  // Returns whether a node has children.
+  bool HasChildren() const;
+
+  // Returns sum of policy priors which have had at least one playout.
+  float GetVisitedPolicy() const;
+  uint32_t GetN() const { return n_; }
+  uint32_t GetNInFlight() const;
+  uint32_t GetChildrenVisits() const;
+  uint32_t GetTotalVisits() const;
+  // Returns n + n_in_flight.
+  int GetNStarted() const { return n_ + GetNInFlight(); }
+
+  float GetQ(float draw_score) const { return wl_ + draw_score * d_; }
+  // Returns node eval, i.e. average subtree V for non-terminal node and -1/0/1
+  // for terminal nodes.
+  float GetWL() const { return wl_; }
+  float GetD() const { return d_; }
+  float GetM() const { return m_; }
+
+  // Returns whether the node is known to be draw/lose/win.
+  bool IsTerminal() const { return terminal_type_ != Terminal::NonTerminal; }
+  bool IsTbTerminal() const { return terminal_type_ == Terminal::Tablebase; }
+  Bounds GetBounds() const { return {lower_bound_, upper_bound_}; }
+
+  uint8_t GetNumEdges() const;
+
+  // Makes the node terminal and sets its score.
+  void MakeTerminal(GameResult result, float plies_left = 1.0f,
+                    Terminal type = Terminal::EndOfGame);
+  // Makes the node not terminal and recomputes bounds, visits and values.
+  // Changes low node as well unless @also_low_node is false.
+  void MakeNotTerminal(bool also_low_node = true);
+  void SetBounds(GameResult lower, GameResult upper);
+
+  // If this node is not in the process of being expanded by another thread
+  // (which can happen only if n==0 and n-in-flight==1), mark the node as
+  // "being updated" by incrementing n-in-flight, and return true.
+  // Otherwise return false.
+  bool TryStartScoreUpdate();
+  // Decrements n-in-flight back.
+  void CancelScoreUpdate(uint32_t multivisit);
+  // Updates the node with newly computed value v.
+  // Updates:
+  // * Q (weighted average of all V in a subtree)
+  // * N (+=multivisit)
+  // * N-in-flight (-=multivisit)
+  void FinalizeScoreUpdate(float v, float d, float m, uint32_t multivisit);
+  // Like FinalizeScoreUpdate, but it updates n existing visits by delta amount.
+  void AdjustForTerminal(float v, float d, float m, uint32_t multivisit);
+  // When search decides to treat one visit as several (in case of collisions
+  // or visiting terminal nodes several times), it amplifies the visit by
+  // incrementing n_in_flight.
+  void IncrementNInFlight(uint32_t multivisit);
+
+  // Returns range for iterating over edges.
+  ConstIterator Edges() const;
+  Iterator Edges();
+
+  // Returns range for iterating over child nodes with N > 0.
+  VisitedNode_Iterator<true> VisitedNodes() const;
+  VisitedNode_Iterator<false> VisitedNodes();
+
+  // Deletes all children except one.
+  // The node provided may be moved, so should not be relied upon to exist
+  // afterwards.
+  void ReleaseChildrenExceptOne(Node* node_to_save) const;
+
+  // Returns move from the point of view of the player making it (if as_opponent
+  // is false) or as opponent (if as_opponent is true).
+  Move GetMove(bool as_opponent = false) const {
+    return edge_.GetMove(as_opponent);
+  }
+  // Returns or sets value of Move policy prior returned from the neural net
+  // (but can be changed by adding Dirichlet noise or when turning terminal).
+  // Must be in [0,1].
+  float GetP() const { return edge_.GetP(); }
+  void SetP(float val) { edge_.SetP(val); }
+
+  const std::shared_ptr<LowNode>& GetLowNode() const { return low_node_; }
+
+  void SetLowNode(std::shared_ptr<LowNode> low_node);
+  void UnsetLowNode();
+
+  // Debug information about the node.
+  std::string DebugString() const;
+  // Writes the edge from node's parent to its low node to @file in the
+  // Graphviz dot format.
+  void DotEdgeString(std::ofstream& file,
+                     bool as_opponent = false,
+                     const LowNode* parent = nullptr) const;
+  // Writes the graph starting at this node to @file in the Graphviz
+  // dot format.
+  void DotGraphString(std::ofstream& file, bool as_opponent = false) const;
+
+  // Returns true if graph under this node has every n_in_flight_ == 0 and
+  // prints offending nodes and low nodes and stats to cerr otherwise.
+  bool ZeroNInFlight() const;
+
+  void SortEdges() const;
+
+  // Index in parent's edges - useful for correlated ordering.
+  uint16_t Index() const { return index_; }
+
+  void SetRepetition() { repetition_ = true; }
+  bool IsRepetition() const { return repetition_; }
+
+  bool WLDMInvariantsHold() const;
+
+#ifndef NDEBUG
+  // RAII holder for a visitor id. It will automatically release the reservation
+  // when going out of scope. It is possible to use visitor for branches. There
+  // must be a full tree walk before id value wraps around or walk will ignore
+  // some nodes.
+  // It doesn't support concurrent access currently. API emulates mutexes which
+  // makes it possible to add limited number of concurrent access and waiting
+  // for free resources if needed.
+  struct VisitorId {
+    using type = uint32_t;
+    using storage = uint32_t;
+
+    VisitorId(const VisitorId&) = delete;
+
+    explicit VisitorId();
+    ~VisitorId();
+
+    operator type() const {
+      return id_;
+    }
+
+    friend class Node;
+    friend class LowNode;
+  private:
+    type id_;
+  };
+#endif
+
+ private:
+  // To minimize the number of padding bytes and to avoid having unnecessary
+  // padding when new fields are added, we arrange the fields by size, largest
+  // to smallest.
+
+  // 16 byte fields on 64-bit platforms, 8 byte on 32-bit.
+  // Shared pointer to the low node.
+  std::shared_ptr<LowNode> low_node_;
+
+  // 8 byte fields.
+  // Average value (from value head of neural network) of all visited nodes in
+  // subtree. For terminal nodes, eval is stored. This is from the perspective
+  // of the player who "just" moved to reach this position, rather than from
+  // the perspective of the player-to-move for the position. WL stands for "W
+  // minus L". Is equal to Q if draw score is 0.
+  double wl_ = 0.0f;
+  // Averaged draw probability. Works similarly to WL, except that D is not
+  // flipped depending on the side to move.
+  double d_ = 0.0f;
+
+  // 8 byte fields on 64-bit platforms, 4 byte on 32-bit.
+  // Pointer to a next sibling. nullptr if there are no further siblings.
+  atomic_unique_ptr<Node> sibling_;
+
+  // 4 byte fields.
+  // Estimated remaining plies.
+  float m_ = 0.0f;
+  // How many completed visits this node had.
+  uint32_t n_ = 0;
+  // (AKA virtual loss.) How many threads currently process this node (started
+  // but not finished). This value is added to n during selection which node
+  // to pick in MCTS, and also when selecting the best move.
+  std::atomic<uint32_t> n_in_flight_ = 0;
+
+  // Move and policy for this edge.
+  Edge edge_;
+
+  // 2 byte fields.
+  // Index of this node in parent's edge list.
+  uint16_t index_;
+
+  // 1 byte fields.
+  // Bit fields using parts of uint8_t fields initialized in the constructor.
+  // Whether or not this node ends the game (with a win for either side or a
+  // draw).
+  Terminal terminal_type_ : 2;
+  // Best and worst result for this node.
+  GameResult lower_bound_ : 2;
+  GameResult upper_bound_ : 2;
+  // Edge was handled as a repetition at some point.
+  bool repetition_ : 1;
+};
+
+// Check that Node still fits into an expected cache line size.
+static_assert(sizeof(Node) <= 64, "Node is too large");
+
+class LowNode {
+ public:
+  LowNode()
+      : terminal_type_(Terminal::NonTerminal),
+        lower_bound_(GameResult::BLACK_WON),
+        upper_bound_(GameResult::WHITE_WON) {}
+  // Init from another low node, but use it for NNEval only.
+  LowNode(const LowNode& p)
+      : wl_(p.wl_),
+        d_(p.d_),
+        m_(p.m_),
+        num_edges_(p.num_edges_),
+        terminal_type_(Terminal::NonTerminal),
+        lower_bound_(GameResult::BLACK_WON),
+        upper_bound_(GameResult::WHITE_WON) {
+    assert(p.edges_);
+    edges_ = std::make_unique<Edge[]>(num_edges_);
+    std::memcpy(edges_.get(), p.edges_.get(), num_edges_ * sizeof(Edge));
+  }
+  // Init @edges_ with moves from @moves and 0 policy.
+  LowNode(const MoveList& moves)
+      : num_edges_(moves.size()),
+        terminal_type_(Terminal::NonTerminal),
+        lower_bound_(GameResult::BLACK_WON),
+        upper_bound_(GameResult::WHITE_WON) {
+    edges_ = Edge::FromMovelist(moves);
+  }
+  // Init @edges_ with moves from @moves and 0 policy.
+  // Also create the first child at @index.
+  LowNode(const MoveList& moves, uint16_t index)
+      : num_edges_(moves.size()),
+        terminal_type_(Terminal::NonTerminal),
+        lower_bound_(GameResult::BLACK_WON),
+        upper_bound_(GameResult::WHITE_WON) {
+    edges_ = Edge::FromMovelist(moves);
+    child_ = std::make_unique<Node>(edges_[index], index);
+  }
+  ~LowNode();
+
+  void SetNNEval(const EvalResult* eval) {
+    assert(n_ == 0);
+    assert(!child_);
+
+    for (size_t idx = 0; idx < num_edges_; idx++) {
+      edges_.get()[idx].SetP(eval->p[idx]);
+    }
+
+    wl_ = eval->q;
+    d_ = eval->d;
+    m_ = eval->m;
+
+    assert(WLDMInvariantsHold());
+  }
+
+  // Gets the slot holding the first child.
+  atomic_unique_ptr<Node>* GetChild() { return &child_; }
+
+  // Returns whether a node has children.
+  bool HasChildren() const { return num_edges_ > 0; }
+
+  uint32_t GetN() const { return n_; }
+  uint32_t GetChildrenVisits() const { return n_ - 1; }  // Wraps if n_ == 0.
+
+  // Returns node eval, i.e. average subtree V for non-terminal node and -1/0/1
+  // for terminal nodes.
+  float GetWL() const { return wl_; }
+  float GetD() const { return d_; }
+  float GetM() const { return m_; }
+
+  // Returns whether the node is known to be draw/loss/win.
+  bool IsTerminal() const { return terminal_type_ != Terminal::NonTerminal; }
+  Bounds GetBounds() const { return {lower_bound_, upper_bound_}; }
+  Terminal GetTerminalType() const { return terminal_type_; }
+
+  uint8_t GetNumEdges() const { return num_edges_; }
+  // Gets pointer to the start of the edge array.
+  Edge* GetEdges() const { return edges_.get(); }
+
+  // Makes the node terminal and sets its score.
+  void MakeTerminal(GameResult result, float plies_left = 0.0f,
+                    Terminal type = Terminal::EndOfGame);
+  // Makes the low node not terminal and recomputes bounds, visits and values
+  // using incoming @node.
+  void MakeNotTerminal(const Node* node);
+  void SetBounds(GameResult lower, GameResult upper);
+
+  // Decrements n-in-flight back.
+  void CancelScoreUpdate(uint32_t multivisit);
+  // Updates the node with newly computed value v.
+  // Updates:
+  // * Q (weighted average of all V in a subtree)
+  // * N (+=multivisit)
+  // * N-in-flight (-=multivisit)
+  void FinalizeScoreUpdate(float v, float d, float m, uint32_t multivisit);
+  // Like FinalizeScoreUpdate, but it updates n existing visits by delta amount.
+  void AdjustForTerminal(float v, float d, float m, uint32_t multivisit);
+
+  // Deletes all children.
+  void ReleaseChildren();
+
+  // Deletes all children except one.
+  // The node provided may be moved, so should not be relied upon to exist
+  // afterwards.
+  void ReleaseChildrenExceptOne(Node* node_to_save);
+
+  // Returns the edge at @index.
+  const Edge& GetEdgeAt(uint16_t index) const;
+
+  // Debug information about the node.
+  std::string DebugString() const;
+  // Writes this node to @file in the Graphviz dot format.
+  void DotNodeString(std::ofstream& file) const;
+
+  void SortEdges() {
+    assert(edges_);
+    assert(!child_);
+    Edge::SortEdges(edges_.get(), num_edges_);
+  }
+
+  // Registers one more parent of this low node.
+  void AddParent() {
+    num_parents_.fetch_add(1, std::memory_order_acq_rel);
+
+    assert(num_parents_ > 0);
+  }
+  // Unregisters one parent of this low node.
+  void RemoveParent() {
+    assert(num_parents_ > 0);
+    num_parents_.fetch_sub(1, std::memory_order_acq_rel);
+  }
+  bool IsTransposition() const {
+    return num_parents_.load(std::memory_order_acquire) > 1;
+  }
+
+  bool WLDMInvariantsHold() const;
+
+#ifndef NDEBUG
+  bool Visit(Node::VisitorId::type id);
+#endif
+
+ private:
+  // To minimize the number of padding bytes and to avoid having unnecessary
+  // padding when new fields are added, we arrange the fields by size, largest
+  // to smallest.
+
+  // 8 byte fields.
+  // Average value (from value head of neural network) of all visited nodes in
+  // subtree. For terminal nodes, eval is stored. This is from the perspective
+  // of the player who "just" moved to reach this position, rather than from the
+  // perspective of the player-to-move for the position.
+  // WL stands for "W minus L". Is equal to Q if draw score is 0.
+  double wl_ = 0.0f;
+  // Averaged draw probability. Works similarly to WL, except that D is not
+  // flipped depending on the side to move.
+  double d_ = 0.0f;
+
+  // 8 byte fields on 64-bit platforms, 4 byte on 32-bit.
+  // Array of edges.
+  std::unique_ptr<Edge[]> edges_;
+  // Pointer to the first child. nullptr when no children.
+  atomic_unique_ptr<Node> child_;
+
+  // 4 byte fields.
+  // Estimated remaining plies.
+  float m_ = 0.0f;
+  // How many completed visits this node had.
+  uint32_t n_ = 0;
+
+  // 2 byte fields.
+  // Number of parents.
+  std::atomic<uint16_t> num_parents_ = {};
+
+  // 1 byte fields.
+  // Number of edges in @edges_.
+  uint8_t num_edges_ = 0;
+  // Bit fields using parts of uint8_t fields initialized in the constructor.
+  // Whether or not this node ends the game (win for either side or a draw).
+  Terminal terminal_type_ : 2;
+  // Best and worst result for this node.
+  GameResult lower_bound_ : 2;
+  GameResult upper_bound_ : 2;
+  // Debug only id as the last to avoid taking place of actively used variables
+  // in the cache.
+#ifndef NDEBUG
+  Node::VisitorId::storage visitor_id_ = {};
+#endif
+};
+
+// Check that LowNode still fits into an expected cache line size.
+static_assert(sizeof(LowNode) <= 64, "LowNode is too large");
+
+// Pairs an Edge pointer with the (possibly null) Node pointer it leads to and
+// offers forwarding accessors so that callers need not touch either directly.
+class EdgeAndNode {
+ public:
+  EdgeAndNode() = default;
+  EdgeAndNode(Edge* edge, Node* node) : edge_(edge), node_(node) {}
+  void Reset() { edge_ = nullptr; }
+  explicit operator bool() const { return edge_ != nullptr; }
+  bool operator==(const EdgeAndNode& rhs) const { return edge_ == rhs.edge_; }
+  bool operator!=(const EdgeAndNode& rhs) const { return edge_ != rhs.edge_; }
+  bool HasNode() const { return node_ != nullptr; }
+  Edge* edge() const { return edge_; }
+  Node* node() const { return node_; }
+
+  // Node values; the caller's default is used while the node has no visits.
+  float GetQ(float default_q, float draw_score) const {
+    return HasVisits() ? node_->GetQ(draw_score) : default_q;
+  }
+  float GetWL(float default_wl) const {
+    return HasVisits() ? node_->GetWL() : default_wl;
+  }
+  float GetD(float default_d) const {
+    return HasVisits() ? node_->GetD() : default_d;
+  }
+  float GetM(float default_m) const {
+    return HasVisits() ? node_->GetM() : default_m;
+  }
+  // Visit counters, forwarded to the node; 0 when there is no node.
+  uint32_t GetN() const { return node_ ? node_->GetN() : 0; }
+  int GetNStarted() const { return node_ ? node_->GetNStarted() : 0; }
+  uint32_t GetNInFlight() const { return node_ ? node_->GetNInFlight() : 0; }
+
+  // Terminal status, forwarded to the node; false when there is no node.
+  bool IsTerminal() const { return node_ && node_->IsTerminal(); }
+  bool IsTbTerminal() const { return node_ && node_->IsTbTerminal(); }
+  Bounds GetBounds() const {
+    if (node_) return node_->GetBounds();
+    return Bounds{GameResult::BLACK_WON, GameResult::WHITE_WON};
+  }
+
+  // Policy prior: read from the node when there is one, else from the edge.
+  float GetP() const { return node_ ? node_->GetP() : edge_->GetP(); }
+  Move GetMove(bool flip = false) const {
+    if (!edge_) return Move();
+    return edge_->GetMove(flip);
+  }
+
+  // Returns U = numerator * P / (1 + NStarted).
+  // Passed numerator is expected to be equal to (cpuct * sqrt(N[parent])).
+  float GetU(float numerator) const {
+    const int started_plus_one = 1 + GetNStarted();
+    return numerator * GetP() / started_plus_one;
+  }
+
+  std::string DebugString() const;
+
+ protected:
+  // nullptr marks the whole pair as "null" (e.g. nothing was found by a
+  // search for a node, or the pair is used as the end iterator signal).
+  Edge* edge_ = nullptr;
+  // nullptr means that no node has been extended from the edge yet.
+  Node* node_ = nullptr;
+
+ private:
+  // True when a node is attached and it reports GetN() > 0.
+  bool HasVisits() const { return node_ && node_->GetN() > 0; }
+};
+
+// TODO(crem) Replace this with less hacky iterator once we support C++17.
+// This class has multiple hypostases within one class:
+// * Range (begin() and end() functions)
+// * Iterator (operator++() and operator*())
+// * Element, pointed by iterator (EdgeAndNode class mainly, but Edge_Iterator
+//   is useful too when client wants to call GetOrSpawnNode).
+//   It's safe to slice EdgeAndNode off Edge_Iterator.
+// It's more customary to have those as three classes, but
+// creating zoo of classes and copying them around while iterating seems
+// excessive.
+//
+// All functions are not thread safe (must be externally synchronized), but
+// it's fine if GetOrSpawnNode is called between calls to functions of the
+// iterator (e.g. advancing the iterator). Other functions that manipulate
+// child_ of parent or the sibling chain are not safe to call while iterating.
+template <bool is_const>
+class Edge_Iterator : public EdgeAndNode {
+ public:
+  using Ptr = std::conditional_t<is_const, const atomic_unique_ptr<Node>*,
+                                 atomic_unique_ptr<Node>*>;
+  using value_type = Edge_Iterator;
+  using iterator_category = std::forward_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using pointer = Edge_Iterator*;
+  using reference = Edge_Iterator&;
+
+  // Creates "end()" iterator.
+  Edge_Iterator() {}
+
+  // Creates "begin()" iterator.
+  Edge_Iterator(LowNode* parent_node)
+      : EdgeAndNode(parent_node != nullptr ? parent_node->GetEdges() : nullptr,
+                    nullptr) {
+    if (parent_node != nullptr) {
+      node_ptr_ = parent_node->GetChild();
+      total_count_ = parent_node->GetNumEdges();
+      if (edge_) Actualize();
+    }
+  }
+
+  // Function to support range interface.
+  Edge_Iterator<is_const> begin() { return *this; }
+  Edge_Iterator<is_const> end() { return {}; }
+
+  // Functions to support iterator interface.
+  // Equality comparison operators are inherited from EdgeAndNode.
+  void operator++() {
+    // If it was the last edge in array, become end(), otherwise advance.
+    if (++current_idx_ == total_count_) {
+      edge_ = nullptr;
+    } else {
+      ++edge_;
+      Actualize();
+    }
+  }
+  Edge_Iterator& operator*() { return *this; }
+
+  // If there is node, return it. Otherwise spawn a new one and return it.
+  Node* GetOrSpawnNode(Node* parent) {
+    if (node_) return node_;  // If there is already a node, return it.
+
+    // We likely need to add a new node, prepare it now.
+    auto low_parent = parent->GetLowNode()->GetEdgeAt(current_idx_);
+    atomic_unique_ptr<Node> new_node =
+        std::make_unique<Node>(low_parent, current_idx_);
+    while (true) {
+      auto node = Actualize();  // But maybe other thread already did that.
+      if (node_) return node_;  // If it did, return.
+
+      // New node needs to be added, but we might be in a race with another
+      // thread doing what we do or adding a different index to the same
+      // sibling.
+
+      // Suppose there are nodes with idx 3 and 7, and we want to insert one
+      // with idx 5. Here is how it looks like:
+      //    node_ptr_ -> &Node(idx_.3).sibling_  ->  Node(idx_.7)
+      // Here is how we do that:
+      // 1. Store pointer to a node idx_.7:
+      //    node_ptr_ -> &Node(idx_.3).sibling_  ->  nullptr
+      //    tmp -> Node(idx_.7)
+      // 2. Create fresh Node(idx_.5):
+      //    node_ptr_ -> &Node(idx_.3).sibling_  ->  Node(idx_.5)
+      //    tmp -> Node(idx_.7)
+      // 3. Attach stored pointer back to a list:
+      //    node_ptr_ ->
+      //         &Node(idx_.3).sibling_ -> Node(idx_.5).sibling_ -> Node(idx_.7)
+
+      // Atomically add the new node into the right place.
+      // Set new node's sibling to the expected sibling seen by Actualize in
+      // node_ptr_.
+      auto new_sibling = new_node->GetSibling();
+      new_sibling->set(node);
+      // Try to atomically insert the new node and stop if it works.
+      if (node_ptr_->compare_exchange(node, new_node)) break;
+      // Recover from failure and try again.
+      // Release expected sibling to avoid double free.
+      new_sibling->release();
+    }
+    // 4. Actualize:
+    //    node_ -> &Node(idx_.5)
+    //    node_ptr_ -> &Node(idx_.5).sibling_ -> Node(idx_.7)
+    Actualize();
+    return node_;
+  }
+
+ private:
+  // Moves node_ptr_ as close as possible to the target index and returns the
+  // contents of node_ptr_ for use by atomic insert in GetOrSpawnNode.
+  Node* Actualize() {
+    // If node_ptr_ is behind, advance it.
+    // This is needed (and has to be 'while' rather than 'if') as other threads
+    // could spawn new nodes between &node_ptr_ and *node_ptr_ while we didn't
+    // see.
+    // Read the direct pointer just once as other threads may change it between
+    // uses.
+    auto node = node_ptr_->get();
+    while (node != nullptr && node->Index() < current_idx_) {
+      node_ptr_ = node->GetSibling();
+      node = node_ptr_->get();
+    }
+    // If in the end node_ptr_ points to the node that we need, populate node_
+    // and advance node_ptr_.
+    if (node != nullptr && node->Index() == current_idx_) {
+      node_ = node;
+      node_ptr_ = node->GetSibling();
+    } else {
+      node_ = nullptr;
+    }
+
+    return node;
+  }
+
+  // Pointer to a pointer to the next node (so that it can be updated when
+  // spawning a new node). nullptr for end() and null-parent iterators.
+  Ptr node_ptr_ = nullptr;
+  uint16_t current_idx_ = 0;
+  uint16_t total_count_ = 0;
+};
+
+inline Node::ConstIterator Node::Edges() const {
+  return ConstIterator(GetLowNode().get());  // Range over the low node's edges.
+}
+inline Node::Iterator Node::Edges() { return Iterator(GetLowNode().get()); }
+
+// TODO(crem) Replace this with less hacky iterator once we support C++17.
+// Walks the children of a low node, stopping only on those with GetN() > 0.
+// This class has multiple hypostases within one class:
+// * Range (begin() and end() functions)
+// * Iterator (operator++() and operator*())
+// It's more customary to have those as two classes, but
+// creating zoo of classes and copying them around while iterating seems
+// excessive.
+//
+// All functions are not thread safe (must be externally synchronized).
+template <bool is_const>
+class VisitedNode_Iterator {
+ public:
+  // Creates "end()" iterator.
+  VisitedNode_Iterator() {}
+
+  // Creates "begin()" iterator; skips ahead if the first child has GetN() == 0.
+  VisitedNode_Iterator(LowNode* parent_node) {
+    if (parent_node == nullptr) return;
+    node_ptr_ = parent_node->GetChild()->get();
+    total_count_ = parent_node->GetNumEdges();
+    if (node_ptr_ != nullptr && node_ptr_->GetN() == 0) operator++();
+  }
+
+  // These are technically wrong, but are usable to compare with end().
+  bool operator==(const VisitedNode_Iterator<is_const>& rhs) const {
+    return node_ptr_ == rhs.node_ptr_;
+  }
+  bool operator!=(const VisitedNode_Iterator<is_const>& rhs) const {
+    return node_ptr_ != rhs.node_ptr_;
+  }
+
+  // Function to support range interface.
+  VisitedNode_Iterator<is_const> begin() { return *this; }
+  VisitedNode_Iterator<is_const> end() { return {}; }
+
+  // Functions to support iterator interface.
+  // Moves to the next sibling with GetN() > 0; becomes end() if none is left.
+  void operator++() {
+    while (true) {
+      node_ptr_ = node_ptr_->GetSibling()->get();
+      if (node_ptr_ == nullptr || node_ptr_->GetN() != 0) return;
+      // If n started is 0, can jump direct to end due to sorted policy
+      // ensuring that each time a new edge becomes best for the first time,
+      // it is always the first of the section at the end that has NStarted of
+      // 0.
+      if (node_ptr_->GetNInFlight() == 0) {
+        node_ptr_ = nullptr;
+        return;
+      }
+    }
+  }
+  Node* operator*() { return node_ptr_; }
+
+ private:
+  // Pointer to current node.
+  Node* node_ptr_ = nullptr;
+  // Within this class current_idx_ is never accessed and total_count_ is only
+  // assigned (in the begin() constructor).
+  uint16_t current_idx_ = 0;
+  uint16_t total_count_ = 0;
+};
+
+inline VisitedNode_Iterator<true> Node::VisitedNodes() const {
+  return VisitedNode_Iterator<true>(GetLowNode().get());  // begin() range
+}
+inline VisitedNode_Iterator<false> Node::VisitedNodes() {
+  return VisitedNode_Iterator<false>(GetLowNode().get());  // begin() range
+}
+
+// Transposition table type: uint64_t key -> weak reference to a DAG low node.
+using TranspositionTable =
+    absl::flat_hash_map<uint64_t, std::weak_ptr<LowNode>>;
+
+class NodeTree {
+ public:
+  ~NodeTree();
+  // Adds a move to current_head_.
+  void MakeMove(Move move);
+  // Resets the current head to ensure it doesn't carry over details from a
+  // previous search.
+  void TrimTreeAtHead();
+  // Sets the position in the tree, trying to reuse the tree.
+  // NOTE(review): a previously documented @auto_garbage_collect flag is not a
+  // parameter of either overload below; confirm how the old tree is released.
+  // Returns whether the new position is the same game as the old position (with
+  // some moves added). Returns false, if the position is completely different,
+  // or if it's shorter than before.
+  bool ResetToPosition(const std::string& starting_fen,
+                       const std::vector<std::string>& moves);
+  bool ResetToPosition(const GameState& pos);
+  const Position& HeadPosition() const { return history_.Last(); }
+  int GetPlyCount() const { return HeadPosition().GetGamePly(); }
+  bool IsBlackToMove() const { return HeadPosition().IsBlackToMove(); }
+  Node* GetCurrentHead() const { return current_head_; }
+  Node* GetGameBeginNode() const { return gamebegin_node_.get(); }
+  const PositionHistory& GetPositionHistory() const { return history_; }
+  const std::vector<Move>& GetMoves() const { return moves_; }
+
+ private:
+  void DeallocateTree();
+  // The node to start search from.
+  Node* current_head_ = nullptr;
+  // Root node of a game tree.
+  std::unique_ptr<Node> gamebegin_node_;
+  PositionHistory history_;
+  std::vector<Move> moves_;
+};
+
+// Thread local queue of released nodes. It tracks whether it belongs to the GC
+// thread to allow faster removal in that thread.
+class ReleaseNodesWork {
+  static constexpr size_t kCapacity = 32;
+public:
+  ReleaseNodesWork(bool gc_thread = false);
+  ~ReleaseNodesWork();
+  bool IsWorker() const;
+
+  // A limited vector-like interface to operate on the container.
+  void emplace_back(std::unique_ptr<Node>&& node);
+  bool empty() const;
+
+  // Swap is used to transfer the queue into a new stack variable. The stack
+  // variable will flush the queue in its destructor.
+  void swap(ReleaseNodesWork &other);
+private:
+  // Flush the local queue to the shared queue.
+  void Submit();
+
+  // No locks required because only one thread can access this object.
+  std::vector<std::unique_ptr<Node>> released_nodes_;
+  bool is_gc_thread_;
+};
+
+class NodeGarbageCollector {
+  NodeGarbageCollector();
+  ~NodeGarbageCollector();
+public:
+  enum State {
+    Running,
+    GoToSleep,
+    Sleeping,
+    Exit,
+  };
+
+  // Access to the singleton, which is created on first use.
+  static NodeGarbageCollector& Instance() {
+    static NodeGarbageCollector singleton;
+    return singleton;
+  }
+  // Delays node destruction until GC thread activates.
+  template<typename UniquePtr>
+  void AddToGcQueue(UniquePtr& node);
+
+  // Allow search to control when garbage collection runs.
+  void Start();
+  void Stop();
+  State Wait() const;
+  void Abort();
+
+  // Moves thread local GC queue to the shared queue. This avoids the case where
+  // a thread frees only a few branches which would be stuck in the thread local
+  // queue. A few big branches can have a major memory impact. If thread exits,
+  // there is no need to call this.
+  void NotifyThreadGoingSleep();
+
+private:
+  // Helper to transition between states safely.
+  bool SetState(State& old, State desired);
+  bool IsActive() const;
+  bool ShouldQueue(std::unique_ptr<Node>& node) const;
+  // The collection thread implementation.
+  void GCThread();
+  // Thread local collection queue. Local queues flush to the shared queue
+  // in batches to avoid lock contention.
+  static ReleaseNodesWork& LocalWork(bool gc_thread = false) {
+    static thread_local ReleaseNodesWork shared{gc_thread};
+    return shared;
+  }
+
+  std::atomic<State> state_ = {Sleeping};
+#ifdef NO_STD_ATOMIC_WAIT
+  // Fallback condition variable when the C++ library doesn't implement
+  // std::atomic::wait().
+  mutable Mutex state_mutex_;
+  mutable std::condition_variable state_signal_;
+#endif
+  std::thread gc_thread_;
+  SpinMutex mutex_;
+  std::deque<std::vector<std::unique_ptr<Node>>> released_nodes_ GUARDED_BY(mutex_);
+
+  friend class ReleaseNodesWork;
+};
+
+}  // namespace dag_classic
+}  // namespace lczero
diff --git a/src/search/dag_classic/params.h b/src/search/dag_classic/params.h
new file mode 100644
index 0000000000..c3f3da8c84
--- /dev/null
+++ b/src/search/dag_classic/params.h
@@ -0,0 +1,39 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "search/classic/params.h"
+
+namespace lczero {
+namespace dag_classic {
+
+using ContemptMode = classic::ContemptMode;      // Reused from classic search.
+using SearchParams = classic::BaseSearchParams;  // Reused from classic search.
+
+}  // namespace dag_classic
+}  // namespace lczero
diff --git a/src/search/dag_classic/search.cc b/src/search/dag_classic/search.cc
new file mode 100644
index 0000000000..6861371dc6
--- /dev/null
+++ b/src/search/dag_classic/search.cc
@@ -0,0 +1,2484 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2023 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "search/dag_classic/search.h"
+
+#include <absl/cleanup/cleanup.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+#include <thread>
+
+#include "search/dag_classic/node.h"
+#include "utils/fastmath.h"
+#include "utils/random.h"
+#include "utils/spinhelper.h"
+#include "utils/trace.h"
+
+namespace lczero {
+namespace dag_classic {
+
+namespace {
+// Longest gap (ms) between "uci info" outputs when nothing interesting happens.
+constexpr int kUciInfoMinimumFrequencyMs = 5000;
+
+MoveList MakeRootMoveFilter(const MoveList& searchmoves,
+                            SyzygyTablebase* syzygy_tb,
+                            const PositionHistory& history, bool fast_play,
+                            std::atomic<int>* tb_hits, bool* dtz_success) {
+  // Root move restriction: explicit searchmoves, else tablebase-probed moves.
+  assert(tb_hits);
+  assert(dtz_success);
+  // Search moves overrides tablebase.
+  if (!searchmoves.empty()) return searchmoves;
+  const auto& board = history.Last().GetBoard();
+  MoveList filtered;
+  const bool can_probe =
+      syzygy_tb && board.castlings().no_legal_castle() &&
+      (board.ours() | board.theirs()).count() <= syzygy_tb->max_cardinality();
+  if (!can_probe) return filtered;
+  const bool fast_probe = fast_play || history.DidRepeatSinceLastZeroingMove();
+  if (syzygy_tb->root_probe(history.Last(), fast_probe, false, &filtered)) {
+    *dtz_success = true;
+  } else if (!syzygy_tb->root_probe_wdl(history.Last(), &filtered)) {
+    return filtered;
+  }
+  tb_hits->fetch_add(1, std::memory_order_acq_rel);
+  return filtered;
+}
+
+// Computes the moves-left ("M") utility term for the children of one parent.
+class MEvaluator {
+ public:
+  MEvaluator()
+      : enabled_(false),
+        m_slope_(0.0f),
+        m_cap_(0.0f),
+        a_constant_(0.0f),
+        a_linear_(0.0f),
+        a_square_(0.0f),
+        q_threshold_(0.0f),
+        parent_m_(0.0f) {}
+
+  MEvaluator(const SearchParams& params, const Node* parent = nullptr)
+      : enabled_(true),
+        m_slope_(params.GetMovesLeftSlope()),
+        m_cap_(params.GetMovesLeftMaxEffect()),
+        a_constant_(params.GetMovesLeftConstantFactor()),
+        a_linear_(params.GetMovesLeftScaledFactor()),
+        a_square_(params.GetMovesLeftQuadraticFactor()),
+        q_threshold_(params.GetMovesLeftThreshold()),
+        parent_m_(parent ? parent->GetM() : 0.0f),
+        parent_within_threshold_(
+            parent ? WithinThreshold(parent, q_threshold_) : false) {}
+
+  void SetParent(const Node* parent) {
+    assert(parent);
+    if (!enabled_) return;
+    parent_m_ = parent->GetM();
+    parent_within_threshold_ = WithinThreshold(parent, q_threshold_);
+  }
+
+  // Utility term that favors shorter wins and longer losses.
+  float GetMUtility(Node* child, float q) const {
+    if (!(enabled_ && parent_within_threshold_)) return 0.0f;
+    const float raw = m_slope_ * (child->GetM() - parent_m_);
+    const float capped = std::clamp(raw, -m_cap_, m_cap_);
+    const float signed_m = capped * FastSign(-q);
+    if (0.0f < q_threshold_ && q_threshold_ < 1.0f) {
+      // This allows a smooth M effect with higher q thresholds, which is
+      // necessary for using MLH together with contempt.
+      q = std::max(0.0f, (std::abs(q) - q_threshold_)) / (1.0f - q_threshold_);
+    }
+    const float amp = a_constant_ + a_linear_ * std::abs(q) + a_square_ * q * q;
+    return signed_m * amp;
+  }
+
+  float GetMUtility(const EdgeAndNode& child, float q) const {
+    if (!(enabled_ && parent_within_threshold_)) return 0.0f;
+    if (child.GetN() == 0) return GetDefaultMUtility();
+    return GetMUtility(child.node(), q);
+  }
+
+  // M utility assigned to children that have no visits yet.
+  float GetDefaultMUtility() const { return 0.0f; }
+
+ private:
+  static bool WithinThreshold(const Node* parent, float q_threshold) {
+    return q_threshold < std::abs(parent->GetQ(0.0f));
+  }
+
+  const bool enabled_;
+  const float m_slope_;
+  const float m_cap_;
+  const float a_constant_;
+  const float a_linear_;
+  const float a_square_;
+  const float q_threshold_;
+  float parent_m_ = 0.0f;
+  bool parent_within_threshold_ = false;
+};
+
+// Unpacks the packed task counter into {packed, tasks_taken, task_count}. The
+// argument is either the atomic itself (const or not; loaded with acquire
+// ordering) or a plain int holding a value that was read earlier.
+// Both counters share one variable because there is a potential race between
+// task workers and ResetTasks. A task worker can read tasks_taken_ and
+// task_count_ to local registers and be suspended by the kernel before it
+// tries to acquire work. Other threads can process all tasks and the main
+// thread can reset tasks before the suspended thread resumes. The resumed
+// thread could then acquire work based on stale values if the stale
+// tasks_taken_ was zero. Packed values avoid the race because the compare
+// exchange that increments tasks_taken_ checks both counters at once.
+template<typename T>
+std::tuple<int, int, int> ReadTaskCount(T& task_count) {
+  int packed;
+  if constexpr(std::is_same_v<std::remove_cv_t<T>, std::atomic<int>>) {
+    packed = task_count.load(std::memory_order_acquire);
+  } else {
+    packed = task_count;
+  }
+  // The top half is tasks taken.
+  const int shift = SearchWorker::kTasksTakenShift;
+  int tasks_taken = packed >> shift;
+  // The bottom is task count. The first shift moves the sign bit from the lower
+  // half to the hardware sign bit. The second shift lowers bits back to the
+  // original positions and duplicates the sign bit if it is set.
+  int tc = (packed << shift) >> shift;
+  return {packed, tasks_taken, tc};
+}
+
+[[maybe_unused]]
+bool IsTasksCompleted(const std::atomic<int>& task_count,
+                      const std::atomic<int>& completed_tasks) {
+  const auto unpacked = ReadTaskCount(task_count);  // {packed, taken, count}
+  const int taken = std::get<1>(unpacked), count = std::get<2>(unpacked);
+  const int done = completed_tasks.load(std::memory_order_acquire);
+  return count == done || (count == -1 && taken == done);
+}
+
+}  // namespace
+
+// Sets up a search: prunes the TT, seeds the searcher limit, fixes contempt.
+Search::Search(const NodeTree& tree, Backend* backend,
+               std::unique_ptr<UciResponder> uci_responder,
+               const MoveList& searchmoves,
+               std::chrono::steady_clock::time_point start_time,
+               std::unique_ptr<classic::SearchStopper> stopper, bool infinite,
+               bool ponder, const OptionsDict& options, TranspositionTable* tt,
+               SyzygyTablebase* syzygy_tb)
+    : ok_to_respond_bestmove_(!infinite && !ponder),
+      stopper_(std::move(stopper)),
+      root_node_(tree.GetCurrentHead()),
+      tt_(tt),
+      syzygy_tb_(syzygy_tb),
+      played_history_(tree.GetPositionHistory()),
+      backend_(backend),
+      backend_attributes_(backend->GetAttributes()),
+      params_(options),
+      searchmoves_(searchmoves),
+      start_time_(start_time),
+      initial_visits_(root_node_->GetN()),
+      root_move_filter_(MakeRootMoveFilter(
+          searchmoves_, syzygy_tb_, played_history_,
+          params_.GetSyzygyFastPlay(), &tb_hits_, &root_is_in_dtz_)),
+      uci_responder_(std::move(uci_responder)) {
+  // Drop transposition table entries whose weak reference has expired.
+  // Garbage collection may expire further entries at any time, so expired
+  // entries can still show up later during the search.
+  absl::erase_if(*tt_, [](const auto& kv) { return kv.second.expired(); });
+
+  LOGFILE << "Transposition table garbage collection done.";
+
+  const auto max_searchers = params_.GetMaxConcurrentSearchers();
+  if (max_searchers != 0) {
+    pending_searchers_.store(max_searchers, std::memory_order_release);
+  }
+
+  // Resolve "play" here so that contempt_mode_ is never PLAY afterwards.
+  contempt_mode_ = params_.GetContemptMode();
+  if (contempt_mode_ != ContemptMode::PLAY) return;
+  if (!infinite) {
+    // Use the root move's side, unless pondering.
+    const bool black = played_history_.IsBlackToMove() != ponder;
+    contempt_mode_ = black ? ContemptMode::BLACK : ContemptMode::WHITE;
+    return;
+  }
+
+  // For infinite search disable contempt, only "white"/"black" make sense.
+  contempt_mode_ = ContemptMode::NONE;
+  // Issue a warning only if contempt mode would have an effect.
+  if (params_.GetWDLRescaleDiff() == 0.0f) return;
+  std::vector<ThinkingInfo> info(1);
+  info.back().comment =
+      "WARNING: Contempt mode set to 'disable' as 'play' not supported "
+      "for infinite search.";
+  uci_responder_->OutputThinkingInfo(&info);
+}
+
+namespace {
+void ApplyDirichletNoise(LowNode* node, float eps, double alpha) {
+  // New prior: P[i] = (1 - eps) * P[i] + eps * eta[i] / sum(eta).
+  const int num_edges = node->GetNumEdges();
+  float total = 0;
+  std::vector<float> noise;
+  noise.reserve(num_edges);
+  for (int i = 0; i < num_edges; ++i) {
+    const float eta = Random::Get().GetGamma(alpha, 1.0);
+    noise.emplace_back(eta);
+    total += eta;
+  }
+  if (total < std::numeric_limits<float>::min()) return;  // Degenerate draw.
+
+  // Plain indexed loop: std::transform does not guarantee in-order calls, so
+  // a counter inside its functor may pair the wrong noise value with an edge.
+  auto edges = node->GetEdges();
+  for (int i = 0; i < num_edges; ++i) {
+    edges[i].SetP(edges[i].GetP() * (1 - eps) + eps * noise[i] / total);
+  }
+}
+}  // namespace
+
+namespace {
+// WDL conversion (random walk model): updates v and d, returns the new mu.
+inline double WDLRescale(float& v, float& d, float wdl_rescale_ratio,
+                         float wdl_rescale_diff, float sign, bool invert,
+                         float max_reasonable_s) {
+  if (invert) {  // Inverse direction: negate the diff and invert the ratio.
+    wdl_rescale_diff = -wdl_rescale_diff;
+    wdl_rescale_ratio = 1.0f / wdl_rescale_ratio;
+  }
+  auto w = (1 + v - d) / 2;  // Win share, taking v = w - l and w + d + l = 1.
+  auto l = (1 - v - d) / 2;  // Loss share under the same assumption.
+  // Safeguard against numerical issues; skip the WDL transformation if any of
+  // w, d or l is within eps of 0 or 1.
+  const float eps = 0.0001f;
+  if (w > eps && d > eps && l > eps && w < (1.0f - eps) && d < (1.0f - eps) &&
+      l < (1.0f - eps)) {
+    auto a = FastLog(1 / l - 1);
+    auto b = FastLog(1 / w - 1);
+    auto s = 2 / (a + b);
+    // Safeguard against unrealistically broad WDL distributions coming from
+    // the NN. Originally hardcoded, made into a parameter for piece odds.
+    if (!invert) s = std::min(max_reasonable_s, s);
+    auto mu = (a - b) / (a + b);
+    auto s_new = s * wdl_rescale_ratio;
+    if (invert) {  // When inverting, the cap is applied after swapping s/s_new.
+      std::swap(s, s_new);
+      s = std::min(max_reasonable_s, s);
+    }
+    auto mu_new = mu + sign * s * s * wdl_rescale_diff;
+    auto w_new = FastLogistic((-1.0f + mu_new) / s_new);
+    auto l_new = FastLogistic((-1.0f - mu_new) / s_new);
+    v = w_new - l_new;
+    d = std::max(0.0f, 1.0f - w_new - l_new);  // Never report a negative d.
+    return mu_new;
+  }
+  return 0;  // Transformation skipped; v and d are left unchanged.
+}
+}  // namespace
+
+void Search::SendUciInfo(const classic::IterationStats& stats)
+                         REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
+  const auto max_pv = params_.GetMultiPv();
+  const auto edges = GetBestChildrenNoTemperature(root_node_, max_pv, 0);
+  const auto score_type = params_.GetScoreType();
+  const auto per_pv_counters = params_.GetPerPvCounters();
+  const auto draw_score = GetDrawScore(false);
+
+  std::vector<ThinkingInfo> uci_infos;
+
+  // Info common for all multipv variants.
+  ThinkingInfo common_info;
+  common_info.depth = cum_depth_ / (total_playouts_ ? total_playouts_ : 1);
+  common_info.seldepth = max_depth_;
+  common_info.time = stats.time_since_movestart;
+  if (!per_pv_counters) {
+    common_info.nodes = total_playouts_ + initial_visits_;
+  }
+  if (stats.time_since_first_batch) {
+    const auto time_since_first_batch_ms = stats.time_since_first_batch;
+    if (time_since_first_batch_ms > 0) {
+      common_info.nps = total_playouts_ * 1000 / time_since_first_batch_ms;
+      common_info.eps = network_evaluations_ * 1000 / time_since_first_batch_ms;
+    }
+  }
+  common_info.tb_hits = tb_hits_.load(std::memory_order_acquire);
+
+  int multipv = 0;
+  const auto default_q = -root_node_->GetQ(-draw_score);
+  const auto default_wl = -root_node_->GetWL();
+  const auto default_d = root_node_->GetD();
+  for (const auto& edge : edges) {
+    ++multipv;
+    uci_infos.emplace_back(common_info);
+    auto& uci_info = uci_infos.back();
+    auto wl = edge.GetWL(default_wl);
+    auto d = edge.GetD(default_d);
+    float mu_uci = 0.0f;
+    if (score_type == "WDL_mu" || (params_.GetWDLRescaleDiff() != 0.0f &&
+                                   contempt_mode_ != ContemptMode::NONE)) {
+      auto sign = ((contempt_mode_ == ContemptMode::BLACK) ==
+                   played_history_.IsBlackToMove())
+                      ? 1.0f
+                      : -1.0f;
+      mu_uci = WDLRescale(
+          wl, d, params_.GetWDLRescaleRatio(),
+          contempt_mode_ == ContemptMode::NONE
+              ? 0
+              : params_.GetWDLRescaleDiff() * params_.GetWDLEvalObjectivity(),
+          sign, true, params_.GetWDLMaxS());
+    }
+    const auto q = edge.GetQ(default_q, draw_score);
+    if (edge.IsTerminal() && wl != 0.0f) {
+      uci_info.mate = std::copysign(
+          std::round(edge.GetM(0.0f) + 1) / 2 + (edge.IsTbTerminal() ? 100 : 0),
+          wl);
+    } else if (score_type == "centipawn_with_drawscore") {
+      uci_info.score = 90 * tan(1.5637541897 * q);
+    } else if (score_type == "centipawn") {
+      uci_info.score = 90 * tan(1.5637541897 * wl);
+    } else if (score_type == "centipawn_2019") {
+      uci_info.score = 295 * wl / (1 - 0.976953126 * std::pow(wl, 14));
+    } else if (score_type == "centipawn_2018") {
+      uci_info.score = 290.680623072 * tan(1.548090806 * wl);
+    } else if (score_type == "win_percentage") {
+      uci_info.score = wl * 5000 + 5000;
+    } else if (score_type == "Q") {
+      uci_info.score = q * 10000;
+    } else if (score_type == "W-L") {
+      uci_info.score = wl * 10000;
+    } else if (score_type == "WDL_mu") {
+      // Reports the WDL mu value whenever it is reasonable, and defaults to
+      // centipawn otherwise.
+      const float centipawn_fallback_threshold = 0.996f;
+      float centipawn_score = 45 * tan(1.56728071628 * wl);
+      uci_info.score =
+          backend_attributes_.has_wdl && mu_uci != 0.0f &&
+                  std::abs(wl) + d < centipawn_fallback_threshold &&
+                  (std::abs(mu_uci) < 1.0f ||
+                   std::abs(centipawn_score) < std::abs(100 * mu_uci))
+              ? 100 * mu_uci
+              : centipawn_score;
+    }
+
+    auto wdl_w =
+        std::max(0, static_cast<int>(std::round(500.0 * (1.0 + wl - d))));
+    auto wdl_l =
+        std::max(0, static_cast<int>(std::round(500.0 * (1.0 - wl - d))));
+    // Using 1000-w-l so that W+D+L add up to 1000.0.
+    auto wdl_d = 1000 - wdl_w - wdl_l;
+    if (wdl_d < 0) {
+      wdl_w = std::min(1000, std::max(0, wdl_w + wdl_d / 2));
+      wdl_l = 1000 - wdl_w;
+      wdl_d = 0;
+    }
+    uci_info.wdl = ThinkingInfo::WDL{wdl_w, wdl_d, wdl_l};
+    if (backend_attributes_.has_mlh) {
+      uci_info.moves_left = static_cast<int>(
+          (1.0f + edge.GetM(1.0f + root_node_->GetM())) / 2.0f);
+    }
+    if (max_pv > 1) uci_info.multipv = multipv;
+    if (per_pv_counters) uci_info.nodes = edge.GetN();
+    bool flip = played_history_.IsBlackToMove();
+    int depth = 0;
+    auto history = played_history_;
+    for (auto iter = edge; iter;
+         iter = GetBestChildNoTemperature(iter.node(), depth), flip = !flip) {
+      uci_info.pv.push_back(iter.GetMove(flip));
+      history.Append(iter.GetMove());
+      // Last edge was dangling or a draw by repetition, cannot continue.
+      if (!iter.node() || history.Last().GetRepetitions() >= 2) break;
+      depth += 1;
+    }
+  }
+
+  if (!uci_infos.empty()) last_outputted_uci_info_ = uci_infos.front();
+  if (current_best_edge_ && !edges.empty()) {
+    last_outputted_info_edge_ = current_best_edge_.edge();
+  }
+
+  uci_responder_->OutputThinkingInfo(&uci_infos);
+}
+
+// Unless bestmove was already sent, outputs UCI info when the best edge, depth
+// or seldepth changed, or kUciInfoMinimumFrequencyMs passed since last output.
+void Search::MaybeOutputInfo(const classic::IterationStats& stats) {
+  SharedMutex::Lock lock(nodes_mutex_);
+  Mutex::Lock counters_lock(counters_mutex_);
+  if (!bestmove_is_sent_ && current_best_edge_ &&
+      (current_best_edge_.edge() != last_outputted_info_edge_ ||
+       last_outputted_uci_info_.depth !=
+           static_cast<int>(cum_depth_ /
+                            (total_playouts_ ? total_playouts_ : 1)) ||
+       last_outputted_uci_info_.seldepth != max_depth_ ||
+       last_outputted_uci_info_.time + kUciInfoMinimumFrequencyMs <
+           GetTimeSinceStart())) {
+    SendUciInfo(stats);
+    if (params_.GetLogLiveStats()) {
+      SendMovesStats();
+    }
+    if (stop_.load(std::memory_order_acquire) && !ok_to_respond_bestmove_) {
+      std::vector<ThinkingInfo> info(1);  // A single comment-only info line.
+      info.back().comment =
+          "WARNING: Search has reached limit and does not make any progress.";
+      uci_responder_->OutputThinkingInfo(&info);
+    }
+  }
+}
+
+int64_t Search::GetTimeSinceStart() const {  // Milliseconds since start_time_.
+  using std::chrono::milliseconds;
+  const auto elapsed = std::chrono::steady_clock::now() - start_time_;
+  return std::chrono::duration_cast<milliseconds>(elapsed).count();
+}
+
+int64_t Search::GetTimeSinceFirstBatch() const {
+  // Zero for as long as no NPS start time has been stored.
+  if (!nps_start_time_) return 0;
+  const auto elapsed = std::chrono::steady_clock::now() - *nps_start_time_;
+  return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
+}
+
+void Search::RecordNPSStartTime() {
+  // Only the first call stores a timestamp; later calls leave it unchanged.
+  if (!nps_start_time_) nps_start_time_ = std::chrono::steady_clock::now();
+}
+
+// Draw score with its sign chosen from the depth parity (root is depth 0, i.e.
+// even) and from which side is to move in played_history_.
+float Search::GetDrawScore(bool is_odd_depth) const {
+  const bool keep_sign = is_odd_depth == played_history_.IsBlackToMove();
+  return keep_sign ? params_.GetDrawScore() : -params_.GetDrawScore();
+}
+
+namespace {
+// FPU is the raw value if absolute, else -Q - value * sqrt(visited policy).
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
+                    float draw_score) {
+  const auto fpu_value = params.GetFpuValue(is_root_node);
+  if (params.GetFpuAbsolute(is_root_node)) return fpu_value;
+  return -node->GetQ(-draw_score) -
+         fpu_value * std::sqrt(node->GetVisitedPolicy());
+}
+
+// Same computation for callers that already have the visited policy at hand.
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
+                    float draw_score, float visited_pol) {
+  const auto fpu_value = params.GetFpuValue(is_root_node);
+  if (params.GetFpuAbsolute(is_root_node)) return fpu_value;
+  // Relative mode: negated node Q minus the value scaled by sqrt(visited_pol).
+  return -node->GetQ(-draw_score) - fpu_value * std::sqrt(visited_pol);
+}
+
+inline float ComputeCpuct(const SearchParams& params, uint32_t N,
+                          bool is_root_node) {
+  const float c_init = params.GetCpuct(is_root_node);
+  const float factor = params.GetCpuctFactor(is_root_node);
+  const float c_base = params.GetCpuctBase(is_root_node);
+  return c_init + (factor ? factor * FastLog((N + c_base) / c_base) : 0.0f);
+}
+}  // namespace
+
+// Always-false ordering: tuple sorting in GetVerboseStats ignores the edge.
+static bool operator<(const EdgeAndNode&, const EdgeAndNode&) { return false; }
+
+std::vector<std::string> Search::GetVerboseStats(  // One line per edge + node.
+    const Node* node, std::optional<Move> move_to_node) const {
+  const bool is_root = (node == root_node_);
+  const bool is_odd_depth = !is_root;
+  const bool is_black_to_move = (played_history_.IsBlackToMove() == is_root);
+  const float draw_score = GetDrawScore(is_odd_depth);
+  const float fpu = GetFpu(params_, node, is_root, draw_score);
+  const float cpuct = ComputeCpuct(params_, node->GetTotalVisits(), is_root);
+  const float U_coeff =
+      cpuct * std::sqrt(std::max(node->GetChildrenVisits(), 1u));
+  std::vector<std::tuple<uint32_t, float, EdgeAndNode>> edges;
+  edges.reserve(node->GetNumEdges());
+  for (const auto& edge : node->Edges()) {
+    edges.emplace_back(edge.GetN(),
+                       edge.GetQ(fpu, draw_score) + edge.GetU(U_coeff),
+                       edge);
+  }
+  std::sort(edges.begin(), edges.end());  // Ascending by N, then by Q + U.
+  // Formatting helpers; each one appends a group of fields to the stream.
+  auto print = [](auto* oss, auto pre, auto v, auto post, auto w, int p = 0) {
+    *oss << pre << std::setw(w) << std::setprecision(p) << v << post;
+  };
+  auto print_head = [&](auto* oss, auto label, int i, auto n, auto f, auto p) {
+    *oss << std::fixed;
+    print(oss, "", label, " ", 5);
+    print(oss, "(", i, ") ", 4);
+    *oss << std::right;
+    print(oss, "N: ", n, " ", 7);
+    print(oss, "(+", f, ") ", 2);
+    print(oss, "(P: ", p * 100, "%) ", 5, p >= 0.99995f ? 1 : 2);
+  };
+  auto print_stats = [&](auto* oss, const auto* n) {
+    const auto sign = n == node ? -1 : 1;  // Negate WL for the node itself.
+    if (n) {
+      auto wl = sign * n->GetWL();
+      auto d = n->GetD();
+      auto is_perspective = ((contempt_mode_ == ContemptMode::BLACK) ==
+                             played_history_.IsBlackToMove())
+                                ? 1.0f
+                                : -1.0f;
+      WDLRescale(  // Presumably rescales wl and d in place — TODO confirm.
+          wl, d, params_.GetWDLRescaleRatio(),
+          contempt_mode_ == ContemptMode::NONE
+              ? 0
+              : params_.GetWDLRescaleDiff() * params_.GetWDLEvalObjectivity(),
+          is_perspective, true, params_.GetWDLMaxS());
+      print(oss, "(WL: ", wl, ") ", 8, 5);
+      print(oss, "(D: ", d, ") ", 5, 3);
+      print(oss, "(M: ", n->GetM(), ") ", 4, 1);
+      print(oss, "(Q: ", wl + draw_score * d, ") ", 8, 5);
+    } else {
+      *oss << "(WL:  -.-----) (D: -.---) (M:  -.-) ";
+      print(oss, "(Q: ", fpu, ") ", 8, 5);  // No node yet: show the FPU value.
+    }
+  };
+  auto print_tail = [&](auto* oss, const auto* n, bool is_edge) {
+    const auto sign = n == node ? -1 : 1;
+    std::optional<float> v;
+    if (n && n->IsTerminal()) {
+      v = n->GetQ(sign * draw_score);
+    } else if (n) {
+      auto history = played_history_;
+      if (move_to_node) {
+        history.Append(*move_to_node);
+      }
+      if (is_edge) {
+        history.Append(n->GetMove());
+      }
+      std::optional<EvalResult> nneval = backend_->GetCachedEvaluation(
+          EvalPosition{history.GetPositions(), {}});
+      if (nneval) v = -nneval->q;
+    }
+    if (v) {
+      print(oss, "(V: ", sign * *v, ") ", 7, 4);
+    } else {
+      *oss << "(V:  -.----) ";
+    }
+
+    if (n) {
+      auto [lo, up] = n->GetBounds();
+      if (sign == -1) {  // Negate and swap the bounds for the node itself.
+        lo = -lo;
+        up = -up;
+        std::swap(lo, up);
+      }
+      *oss << (lo == up                                                ? "(T) "
+               : lo == GameResult::DRAW && up == GameResult::WHITE_WON ? "(W) "
+               : lo == GameResult::BLACK_WON && up == GameResult::DRAW ? "(L) "
+                                                                       : "");
+    }
+  };
+  // One formatted line per edge in sorted order; the node's own line is last.
+  std::vector<std::string> infos;
+  const auto m_evaluator =
+      backend_attributes_.has_mlh ? MEvaluator(params_, node) : MEvaluator();
+  for (const auto& edge_tuple : edges) {
+    const auto& edge = std::get<2>(edge_tuple);
+    float Q = edge.GetQ(fpu, draw_score);
+    float M = m_evaluator.GetMUtility(edge, Q);
+    std::ostringstream oss;
+    oss << std::left;
+    // TODO: should this be displaying transformed index?
+    print_head(&oss, edge.GetMove(is_black_to_move).ToString(true),
+               MoveToNNIndex(edge.GetMove(), 0), edge.GetN(),
+               edge.GetNInFlight(), edge.GetP());
+    print_stats(&oss, edge.node());
+    print(&oss, "(U: ", edge.GetU(U_coeff), ") ", 6, 5);
+    print(&oss, "(S: ", Q + edge.GetU(U_coeff) + M, ") ", 8, 5);
+    print_tail(&oss, edge.node(), true);
+    infos.emplace_back(oss.str());
+  }
+
+  // Include stats about the node in similar format to its children above.
+  std::ostringstream oss;
+  print_head(&oss, "node ", node->GetNumEdges(), node->GetN(),
+             node->GetNInFlight(), node->GetVisitedPolicy());
+  print_stats(&oss, node);
+  print_tail(&oss, node, false);
+  infos.emplace_back(oss.str());
+  return infos;
+}
+
+void Search::SendMovesStats() const REQUIRES(counters_mutex_) {
+  auto move_stats = GetVerboseStats(root_node_, std::nullopt);
+  // Root stats go out as UCI info comments when verbose, else to the log file.
+  if (params_.GetVerboseStats()) {
+    std::vector<ThinkingInfo> infos;
+    std::transform(move_stats.begin(), move_stats.end(),
+                   std::back_inserter(infos), [](const std::string& line) {
+                     ThinkingInfo info;
+                     info.comment = line;
+                     return info;
+                   });
+    uci_responder_->OutputThinkingInfo(&infos);
+  } else {
+    LOGFILE << "=== Move stats:";
+    for (const auto& line : move_stats) LOGFILE << line;
+  }
+  for (auto& edge : root_node_->Edges()) {  // Log stats below final_bestmove_.
+    if (!(edge.GetMove(played_history_.IsBlackToMove()) == final_bestmove_)) {
+      continue;
+    }
+    if (edge.HasNode()) {
+      LOGFILE << "--- Opponent moves after: " << final_bestmove_.ToString(true);
+      for (const auto& line : GetVerboseStats(edge.node(), edge.GetMove())) {
+        LOGFILE << line;
+      }
+    }
+  }
+}
+
+void Search::MaybeTriggerStop(const classic::IterationStats& stats,
+                              classic::StoppersHints* hints) {
+  hints->Reset();
+  if (params_.GetNpsLimit() > 0) {
+    hints->UpdateEstimatedNps(params_.GetNpsLimit());
+  }
+  SharedMutex::Lock nodes_lock(nodes_mutex_);
+  Mutex::Lock lock(counters_mutex_);
+  // Already responded bestmove, nothing to do here.
+  if (bestmove_is_sent_) return;
+  // Don't stop when the root node is not yet expanded.
+  if (stats.total_nodes == 0) return;
+  // Still running: stop if the stopper says so, else maybe start GC (once).
+  if (!stop_.load(std::memory_order_acquire)) {
+    const float delay = params_.GetGarbageCollectionDelay() / 100.0f;
+    if (stopper_->ShouldStop(stats, hints)) {
+      FireStopInternal();
+    } else if (!gc_started_ &&
+        stats.time_since_movestart > delay *
+        (stats.time_since_movestart + hints->GetEstimatedRemainingTimeMs())) {
+      NodeGarbageCollector::Instance().Start();
+      gc_started_ = true;  // Start() is requested at most once per search.
+    }
+  }
+
+  // If we are the first to see that stop is needed.
+  if (stop_.load(std::memory_order_acquire) && ok_to_respond_bestmove_ &&
+      !bestmove_is_sent_) {
+    SendUciInfo(stats);
+    EnsureBestMoveKnown();
+    SendMovesStats();
+    BestMoveInfo info(final_bestmove_, final_pondermove_);
+    uci_responder_->OutputBestMove(&info);
+    stopper_->OnSearchDone(stats);
+    bestmove_is_sent_ = true;
+    current_best_edge_ = EdgeAndNode();
+    NodeGarbageCollector::Instance().Stop();
+  }
+}
+
+// Returns the evaluation of the best child picked without temperature, so it
+// may sometimes differ from the move GetBestMove reports. Optional outputs
+// @move and @is_terminal stay unwritten when the root has no children.
+Eval Search::GetBestEval(Move* move, bool* is_terminal) const {
+  SharedMutex::SharedLock lock(nodes_mutex_);
+  Mutex::Lock counters_lock(counters_mutex_);
+  float parent_wl = -root_node_->GetWL();
+  float parent_d = root_node_->GetD();
+  float parent_m = root_node_->GetM();  // Root values act as fallbacks below.
+  if (!root_node_->HasChildren()) return {parent_wl, parent_d, parent_m};
+  EdgeAndNode best_edge = GetBestChildNoTemperature(root_node_, 0);
+  if (move) *move = best_edge.GetMove(played_history_.IsBlackToMove());
+  if (is_terminal) *is_terminal = best_edge.IsTerminal();
+  return {best_edge.GetWL(parent_wl), best_edge.GetD(parent_d),
+          best_edge.GetM(parent_m - 1) + 1};
+}
+
+std::pair<Move, Move> Search::GetBestMove() {  // {best move, ponder move}.
+  SharedMutex::Lock nodes_guard(nodes_mutex_);
+  Mutex::Lock counters_guard(counters_mutex_);
+  EnsureBestMoveKnown();
+  return std::pair<Move, Move>(final_bestmove_, final_pondermove_);
+}
+
+std::int64_t Search::GetTotalPlayouts() const {  // Read under a shared lock.
+  SharedMutex::SharedLock nodes_guard(nodes_mutex_);
+  return total_playouts_;
+}
+
+void Search::ResetBestMove() {
+  SharedMutex::Lock tree_guard(nodes_mutex_);
+  Mutex::Lock counters_guard(counters_mutex_);
+  const bool was_sent = bestmove_is_sent_;
+  bestmove_is_sent_ = false;  // EnsureBestMoveKnown() returns early while set.
+  EnsureBestMoveKnown();
+  bestmove_is_sent_ = was_sent;  // Put the caller-visible flag back.
+}
+
+// Computes the best move, maybe with temperature (according to the settings).
+void Search::EnsureBestMoveKnown() REQUIRES(nodes_mutex_)
+    REQUIRES(counters_mutex_) {
+  if (bestmove_is_sent_) return;
+  if (root_node_->GetN() == 0) return;
+  if (!root_node_->HasChildren()) return;
+
+  float temperature = params_.GetTemperature();
+  const int cutoff_move = params_.GetTemperatureCutoffMove();
+  const int decay_delay_moves = params_.GetTempDecayDelayMoves();
+  const int decay_moves = params_.GetTempDecayMoves();
+  const int moves = played_history_.Last().GetGamePly() / 2;
+  // From the cutoff move on use the endgame temperature, else decay linearly.
+  if (cutoff_move && (moves + 1) >= cutoff_move) {
+    temperature = params_.GetTemperatureEndgame();
+  } else if (temperature && decay_moves) {
+    if (moves >= decay_delay_moves + decay_moves) {
+      temperature = 0.0;
+    } else if (moves >= decay_delay_moves) {
+      temperature *=
+          static_cast<float>(decay_delay_moves + decay_moves - moves) /
+          decay_moves;
+    }
+    // don't allow temperature to decay below endgame temperature
+    if (temperature < params_.GetTemperatureEndgame()) {
+      temperature = params_.GetTemperatureEndgame();
+    }
+  }
+  // A zero temperature selects the best child deterministically.
+  auto bestmove_edge = temperature
+                           ? GetBestRootChildWithTemperature(temperature)
+                           : GetBestChildNoTemperature(root_node_, 0);
+  final_bestmove_ = bestmove_edge.GetMove(played_history_.IsBlackToMove());
+  // NOTE(review): final_pondermove_ keeps its old value when this is skipped.
+  if (bestmove_edge.GetN() > 0 && bestmove_edge.node()->HasChildren()) {
+    final_pondermove_ = GetBestChildNoTemperature(bestmove_edge.node(), 1)
+                            .GetMove(!played_history_.IsBlackToMove());
+  }
+}
+
+// Returns up to @count best children of @parent, ordered best first.
+std::vector<EdgeAndNode> Search::GetBestChildrenNoTemperature(Node* parent,
+                                                              int count,
+                                                              int depth) const {
+  // Even if Edges is populated at this point, it's a race condition to access
+  // the node, so exit quickly.
+  if (parent->GetN() == 0) return {};
+  const bool is_odd_depth = (depth % 2) == 1;
+  const float draw_score = GetDrawScore(is_odd_depth);
+  // Best child is selected using the following criteria:
+  // * Prefer shorter terminal wins / avoid shorter terminal losses.
+  // * Largest number of playouts.
+  // * If two nodes have equal number:
+  //   * If that number is 0, the one with larger prior wins.
+  //   * If that number is larger than 0, the one with larger eval wins.
+  std::vector<EdgeAndNode> edges;
+  for (auto& edge : parent->Edges()) {
+    if (parent == root_node_ && !root_move_filter_.empty() &&
+        std::find(root_move_filter_.begin(), root_move_filter_.end(),
+                  edge.GetMove()) == root_move_filter_.end()) {
+      continue;  // At the root, skip moves missing from a non-empty filter.
+    }
+    edges.push_back(edge);
+  }
+  const auto middle = (static_cast<int>(edges.size()) > count)
+                          ? edges.begin() + count
+                          : edges.end();
+  std::partial_sort(
+      edges.begin(), middle, edges.end(),
+      [draw_score](const auto& a, const auto& b) {
+        // The function returns "true" when a is preferred to b.
+
+        // Lists edge types from less desirable to more desirable.
+        enum EdgeRank {
+          kTerminalLoss,
+          kTablebaseLoss,
+          kNonTerminal,  // Non terminal or terminal draw.
+          kTablebaseWin,
+          kTerminalWin,
+        };
+
+        auto GetEdgeRank = [](const EdgeAndNode& edge) {
+          // This default isn't used as wl only checked for case edge is
+          // terminal.
+          const auto wl = edge.GetWL(0.0f);
+          // Not safe to access IsTerminal if GetN is 0.
+          if (edge.GetN() == 0 || !edge.IsTerminal() || !wl) {
+            return kNonTerminal;
+          }
+          if (edge.IsTbTerminal()) {
+            return wl < 0.0 ? kTablebaseLoss : kTablebaseWin;
+          }
+          return wl < 0.0 ? kTerminalLoss : kTerminalWin;
+        };
+
+        // If moves have different outcomes, prefer better outcome.
+        const auto a_rank = GetEdgeRank(a);
+        const auto b_rank = GetEdgeRank(b);
+        if (a_rank != b_rank) return a_rank > b_rank;
+
+        // If both are terminal draws, try to make it shorter.
+        // Not safe to access IsTerminal if GetN is 0.
+        if (a_rank == kNonTerminal && a.GetN() != 0 && b.GetN() != 0 &&
+            a.IsTerminal() && b.IsTerminal()) {
+          if (a.IsTbTerminal() != b.IsTbTerminal()) {
+            // Prefer non-tablebase draws.
+            return a.IsTbTerminal() < b.IsTbTerminal();
+          }
+          // Prefer shorter draws.
+          return a.GetM(0.0f) < b.GetM(0.0f);
+        }
+
+        // Neither is terminal, use standard rule.
+        if (a_rank == kNonTerminal) {
+          // Prefer largest playouts then eval then prior.
+          if (a.GetN() != b.GetN()) return a.GetN() > b.GetN();
+          // Default doesn't matter here so long as they are the same as either
+          // both are N==0 (thus we're comparing equal defaults) or N!=0 and
+          // default isn't used.
+          if (a.GetQ(0.0f, draw_score) != b.GetQ(0.0f, draw_score)) {
+            return a.GetQ(0.0f, draw_score) > b.GetQ(0.0f, draw_score);
+          }
+          return a.GetP() > b.GetP();
+        }
+
+        // Both variants are winning, prefer shortest win.
+        if (a_rank > kNonTerminal) {
+          return a.GetM(0.0f) < b.GetM(0.0f);
+        }
+
+        // Both variants are losing, prefer longest losses.
+        return a.GetM(0.0f) > b.GetM(0.0f);
+      });
+  // Only the first @count entries are sorted; drop the rest.
+  if (count < static_cast<int>(edges.size())) {
+    edges.resize(count);
+  }
+  return edges;
+}
+
+// Returns the top-ranked child of @parent, or an empty EdgeAndNode if none.
+EdgeAndNode Search::GetBestChildNoTemperature(Node* parent, int depth) const {
+  const auto best = GetBestChildrenNoTemperature(parent, /* count= */ 1, depth);
+  return !best.empty() ? best.front() : EdgeAndNode();
+}
+
+// Returns a child of a root chosen according to weighted-by-temperature visit
+// count. Returns an empty EdgeAndNode when no edge is eligible.
+EdgeAndNode Search::GetBestRootChildWithTemperature(float temperature) const {
+  // Root is at even depth.
+  const float draw_score = GetDrawScore(/* is_odd_depth= */ false);
+
+  std::vector<float> cumulative_sums;
+  float sum = 0.0;
+  float max_n = 0.0;
+  const float offset = params_.GetTemperatureVisitOffset();
+  float max_eval = -1.0f;
+  const float fpu =
+      GetFpu(params_, root_node_, /* is_root= */ true, draw_score);
+  // Pass 1: find the largest offset visit count and the eval of that edge.
+  for (auto& edge : root_node_->Edges()) {
+    if (!root_move_filter_.empty() &&
+        std::find(root_move_filter_.begin(), root_move_filter_.end(),
+                  edge.GetMove()) == root_move_filter_.end()) {
+      continue;
+    }
+    if (edge.GetN() + offset > max_n) {
+      max_n = edge.GetN() + offset;
+      max_eval = edge.GetQ(fpu, draw_score);
+    }
+  }
+
+  // TODO(crem) Simplify this code when samplers.h is merged.
+  const float min_eval =
+      max_eval - params_.GetTemperatureWinpctCutoff() / 50.0f;
+  for (auto& edge : root_node_->Edges()) {  // Pass 2: cumulative weights.
+    if (!root_move_filter_.empty() &&
+        std::find(root_move_filter_.begin(), root_move_filter_.end(),
+                  edge.GetMove()) == root_move_filter_.end()) {
+      continue;
+    }
+    if (edge.GetQ(fpu, draw_score) < min_eval) continue;
+    sum += std::pow(
+        std::max(0.0f,
+                 (max_n <= 0.0f
+                      ? edge.GetP()
+                      : ((static_cast<float>(edge.GetN()) + offset) / max_n))),
+        1 / temperature);
+    cumulative_sums.push_back(sum);
+  }
+  assert(sum);
+  if (cumulative_sums.empty()) return {};  // back() on empty would be UB.
+  const float toss = Random::Get().GetFloat(cumulative_sums.back());
+  int idx =
+      std::lower_bound(cumulative_sums.begin(), cumulative_sums.end(), toss) -
+      cumulative_sums.begin();
+  // Pass 3: walk the same eligible edges again and return the idx-th one.
+  for (auto& edge : root_node_->Edges()) {
+    if (!root_move_filter_.empty() &&
+        std::find(root_move_filter_.begin(), root_move_filter_.end(),
+                  edge.GetMove()) == root_move_filter_.end()) {
+      continue;
+    }
+    if (edge.GetQ(fpu, draw_score) < min_eval) continue;
+    if (idx-- == 0) return edge;
+  }
+  assert(false);
+  return {};
+}
+
+void Search::StartThreads(size_t how_many) {
+  Mutex::Lock lock(threads_mutex_);
+  const bool first_start = threads_.empty();
+  // On the first start a zero request means the backend's suggested count,
+  // plus one extra thread when the backend does not run on the CPU.
+  if (how_many == 0 && first_start) {
+    how_many = backend_attributes_.suggested_num_search_threads +
+               !backend_attributes_.runs_on_cpu;
+  }
+  thread_count_.store(how_many, std::memory_order_release);
+  // First thread is a watchdog thread; it is only created on the first start.
+  if (first_start) {
+    threads_.emplace_back([this]() { WatchdogThread(); });
+  }
+  // Start working threads.
+  for (size_t started = 0; started < how_many; ++started) {
+    threads_.emplace_back([this]() {
+      SearchWorker worker(this, params_);
+      worker.RunBlocking();
+    });
+  }
+  LOGFILE << "Search started. " << GetTimeSinceStart()
+          << "ms already passed.";
+}
+
+void Search::RunBlocking(size_t threads) {  // Start threads, then join them.
+  StartThreads(threads);
+  Wait();
+}
+
+bool Search::IsSearchActive() const {  // True until the stop_ flag is set.
+  return !stop_.load(std::memory_order_acquire);
+}
+
+void Search::PopulateCommonIterationStats(classic::IterationStats* stats) {
+  stats->time_since_movestart = GetTimeSinceStart();
+  // The remaining fields are filled while holding a shared nodes_mutex_ lock.
+  SharedMutex::SharedLock nodes_lock(nodes_mutex_);
+  stats->time_since_first_batch = GetTimeSinceFirstBatch();
+  stats->total_nodes = total_playouts_ + initial_visits_;
+  stats->nodes_since_movestart = total_playouts_;
+  stats->batches_since_movestart = total_batches_;
+  stats->average_depth = cum_depth_ / (total_playouts_ ? total_playouts_ : 1);
+  stats->edge_n.clear();
+  stats->win_found = false;
+  stats->may_resign = true;
+  stats->num_losing_edges = 0;
+  stats->time_usage_hint_ = classic::IterationStats::TimeUsageHint::kNormal;
+  stats->mate_depth = std::numeric_limits<int>::max();
+
+  // If root node hasn't finished first visit, none of this code is safe.
+  if (root_node_->GetN() > 0) {
+    const auto draw_score = GetDrawScore(true);
+    const float fpu =
+        GetFpu(params_, root_node_, /* is_root_node */ true, draw_score);
+    float max_q_plus_m = -1000;
+    uint64_t max_n = 0;
+    bool max_n_has_max_q_plus_m = true;
+    const auto m_evaluator = backend_attributes_.has_mlh
+                                 ? MEvaluator(params_, root_node_)
+                                 : MEvaluator();
+    for (const auto& edge : root_node_->Edges()) {
+      const auto n = edge.GetN();
+      const auto q = edge.GetQ(fpu, draw_score);
+      const auto m = m_evaluator.GetMUtility(edge, q);
+      const auto q_plus_m = q + m;
+      stats->edge_n.push_back(n);
+      if (n > 0 && edge.IsTerminal() && edge.GetWL(0.0f) > 0.0f) {
+        stats->win_found = true;
+      }
+      if (n > 0 && edge.IsTerminal() && edge.GetWL(0.0f) < 0.0f) {
+        stats->num_losing_edges += 1;
+      }
+      if (n > 0 && edge.IsTerminal() && edge.GetWL(0.0f) == 1.0f &&
+          !edge.IsTbTerminal()) {
+        stats->mate_depth =  // Smallest value over non-tablebase terminal wins.
+            std::min(stats->mate_depth,
+                     static_cast<int>(std::round(edge.GetM(0.0f))) / 2 + 1);
+      }
+
+      // If game is resignable, no need for moving quicker. This allows
+      // proving mate when losing anyway for better score output.
+      // Hardcoded resign threshold, because there is no available parameter.
+      if (n > 0 && q > -0.98f) {
+        stats->may_resign = false;
+      }
+      if (max_n < n) {
+        max_n = n;
+        max_n_has_max_q_plus_m = false;
+      }
+      if (max_q_plus_m <= q_plus_m) {
+        max_n_has_max_q_plus_m = (max_n == n);
+        max_q_plus_m = q_plus_m;
+      }
+    }
+    if (!max_n_has_max_q_plus_m) {  // Most visited edge lacks the best Q + M.
+      stats->time_usage_hint_ =
+          classic::IterationStats::TimeUsageHint::kNeedMoreTime;
+    }
+  }
+}
+
+void Search::WatchdogThread() {
+  LOGFILE << "Start a watchdog thread.";
+  classic::StoppersHints hints;
+  classic::IterationStats stats;
+  while (true) {  // Poll until the bestmove has been sent.
+    PopulateCommonIterationStats(&stats);
+    MaybeTriggerStop(stats, &hints);
+    MaybeOutputInfo(stats);
+
+    constexpr auto kMaxWaitTimeMs = 100;
+    constexpr auto kMinWaitTimeMs = 1;
+
+    Mutex::Lock lock(counters_mutex_);
+    // Only exit when bestmove is responded. It may happen that search threads
+    // already all exited, and we need at least one thread that can do that.
+    if (bestmove_is_sent_) break;
+    // Clamp the wait to the [kMinWaitTimeMs, kMaxWaitTimeMs] range.
+    auto remaining_time = hints.GetEstimatedRemainingTimeMs();
+    if (remaining_time > kMaxWaitTimeMs) remaining_time = kMaxWaitTimeMs;
+    if (remaining_time < kMinWaitTimeMs) remaining_time = kMinWaitTimeMs;
+    // There is no real need to have max wait time, and sometimes it's fine
+    // to wait without timeout at all (e.g. in `go nodes` mode), but we
+    // still limit wait time for exotic cases like when pc goes to sleep
+    // mode during thinking.
+    // Minimum wait time is there to prevent busy wait and other threads
+    // starvation.
+    watchdog_cv_.wait_for(
+        lock.get_raw(), std::chrono::milliseconds(remaining_time),
+        [this]() { return stop_.load(std::memory_order_acquire); });
+  }
+  LOGFILE << "End a watchdog thread.";
+}
+
+void Search::FireStopInternal() {  // Sets stop_ and wakes watchdog_cv_ waiters.
+  stop_.store(true, std::memory_order_release);
+  watchdog_cv_.notify_all();
+}
+
+void Search::Stop() {
+  NodeGarbageCollector::Instance().Stop();
+  Mutex::Lock lock(counters_mutex_);
+  ok_to_respond_bestmove_ = true;  // MaybeTriggerStop checks this before reply.
+  FireStopInternal();
+  LOGFILE << "Stopping search due to `stop` uci command.";
+}
+
+void Search::Abort() {
+  NodeGarbageCollector::Instance().Abort();
+  Mutex::Lock lock(counters_mutex_);
+  if (!stop_.load(std::memory_order_acquire) ||
+      (!bestmove_is_sent_ && !ok_to_respond_bestmove_)) {
+    bestmove_is_sent_ = true;  // Flag set here without outputting a bestmove.
+    FireStopInternal();
+  }
+  LOGFILE << "Aborting search, if it is still active.";
+}
+
+void Search::Wait() {  // Waits for the node GC, then joins all search threads.
+  NodeGarbageCollector::Instance().Wait();
+  Mutex::Lock lock(threads_mutex_);
+  bool active_threads = !threads_.empty();
+  while (!threads_.empty()) {
+    threads_.back().join();
+    threads_.pop_back();
+  }
+  if (active_threads) {
+    SharedMutex::Lock nodes_lock(nodes_mutex_);  // Renamed: no longer shadows.
+    // With every thread joined, the root must have no visits in flight left.
+    assert(root_node_->ZeroNInFlight());
+  }
+  LOGFILE << "Search threads cleaned.";
+}
+
+void SearchWorker::CancelCollisions() {  // Skips each path's last element.
+  for (auto& entry : minibatch_) {
+    if (!entry.IsCollision()) continue;
+    const auto& path = entry.path;  // Read-only walk: bind, do not copy.
+    for (auto it = ++(path.crbegin()); it != path.crend(); ++it) {
+      std::get<0>(*it)->CancelScoreUpdate(entry.multivisit);
+    }
+  }
+}
+
+Search::~Search() {  // Aborts the search and joins its threads before exit.
+  Abort();
+  Wait();
+  LOGFILE << "Search destroyed.";
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// SearchWorker
+//////////////////////////////////////////////////////////////////////////////
+
+SearchWorker::~SearchWorker() {
+  {
+    // All tasks must already be finished here: a gathering task that is still
+    // running could increment task_count_, which would break the exit state.
+    assert(IsTasksCompleted(task_count_, completed_tasks_));
+    task_count_.fetch_or(kTaskCountSuspend, std::memory_order_release);
+    Mutex::Lock exit_guard(picking_tasks_mutex_);
+    exiting_ = true;
+    task_added_.notify_all();
+  }
+  // Wait for every thread in task_threads_ to finish.
+  for (auto& helper : task_threads_) {
+    helper.join();
+  }
+  LOGFILE << "Search worker destroyed.";
+}
+
+std::tuple<SearchWorker::PickTask*, int, int> SearchWorker::PickTaskToProcess() {
+  auto [packed_value, nta, tc] = ReadTaskCount(task_count_);
+  // Result is {task, nta, tc}; task is nullptr when no task could be claimed.
+  // Check if tasks are queued and try increment taken count.
+  while (nta < tc &&
+      !task_count_.compare_exchange_weak(packed_value, packed_value + kTasksTakenOne,
+                                         std::memory_order_acq_rel)) {
+    // Queue had tasks but another worker increment taken. We check
+    // if new work was added to the queue. Then we try to increment
+    // taken again.
+    std::tie(packed_value, nta, tc) = ReadTaskCount(packed_value);
+  }
+  // We incremented taken if nta and tc are different
+  if (nta < tc) {
+    return {picking_tasks_.data() + nta, nta, tc};
+  }
+  return {nullptr, nta, tc};
+}
+
+void SearchWorker::ProcessTask(PickTask* task, int id,
+                               std::vector<NodeToProcess>* receiver,
+                               TaskWorkspace* workspace) {
+  switch (task->task_type) {  // Dispatch on the kind of queued work.
+    case PickTask::kGathering: {
+      PickNodesToExtendTask(task->start_path, task->collision_limit,
+                            task->history, receiver,
+                            workspace);
+      break;
+    }
+    case PickTask::kProcessing: {
+      ProcessPickedTask(task->start_idx, task->end_idx);
+      break;
+    }
+  }
+  picking_tasks_.data()[id].complete = true;  // Mark slot `id` as done.
+  completed_tasks_.fetch_add(1, std::memory_order_acq_rel);
+}
+
+void SearchWorker::RunTasks(int tid) {  // Task loop; returns when exiting_.
+  while (true) {
+    PickTask* task = nullptr;
+    int id = 0;
+    int tc = 0;
+    {
+      int spins = 0;
+      while (true) {
+        std::tie(task, id, tc) = PickTaskToProcess();
+        if (task) {
+          break;
+        } else if (tc != -1) {
+          spins++;  // Spin on a non -1 count; yield after every 512 spins.
+          if (spins >= 512) {
+            std::this_thread::yield();
+            spins = 0;
+          } else {
+            SpinloopPause();
+          }
+          continue;
+        }
+        spins = 0;
+        // Looks like sleep time.
+        Mutex::Lock lock(picking_tasks_mutex_);
+        // Refresh them now we have the lock.
+        int fresh_tc = 0, fresh_nta = 0;  // Own names: outer `tc` not shadowed.
+        std::tie(std::ignore, std::ignore, fresh_tc) = ReadTaskCount(task_count_);
+        if (fresh_tc != -1) continue;
+        if (exiting_) return;
+        task_added_.wait(lock.get_raw());
+        std::tie(std::ignore, fresh_nta, fresh_tc) = ReadTaskCount(task_count_);
+        if (fresh_nta >= fresh_tc && exiting_) return;
+      }
+    }
+    if (task != nullptr) {
+      ProcessTask(task, id, &(task->results), &(task_workspaces_[tid]));
+    }
+  }
+}
+
+void SearchWorker::ExecuteOneIteration() {
+  // 1. Initialize internal structures.
+  InitializeIteration();
+
+  if (params_.GetMaxConcurrentSearchers() != 0) {  // 0 skips this gate.
+    std::unique_ptr<SpinHelper> spin_helper;
+    if (params_.GetSearchSpinBackoff()) {
+      spin_helper = std::make_unique<ExponentialBackoffSpinHelper>();
+    } else {
+      // This is a hard spin lock to reduce latency but at the expense of busy
+      // wait cpu usage. If search worker count is large, this is probably a
+      // bad idea.
+      spin_helper = std::make_unique<SpinHelper>();
+    }
+
+    while (true) {
+      // If search is stopped, we've not gathered or done anything and we don't
+      // want to, so we can safely skip all below. But make sure we have done
+      // at least one iteration.
+      if (search_->stop_.load(std::memory_order_acquire) &&
+          search_->GetTotalPlayouts() + search_->initial_visits_ > 0) {
+        return;
+      }
+
+      int available =
+          search_->pending_searchers_.load(std::memory_order_acquire);
+      if (available == 0) {  // No free searcher slot; wait and retry.
+        spin_helper->Wait();
+        continue;
+      }
+
+      if (search_->pending_searchers_.compare_exchange_weak(
+              available, available - 1, std::memory_order_acq_rel)) {
+        break;  // Slot claimed; it is returned after GatherMinibatch().
+      } else {
+        spin_helper->Backoff();
+      }
+    }
+  }
+
+  // 2. Gather minibatch, then flag the task queue with kTaskCountSuspend.
+  GatherMinibatch();
+  assert(IsTasksCompleted(task_count_, completed_tasks_));
+  task_count_.fetch_or(kTaskCountSuspend, std::memory_order_release);
+  search_->backend_waiting_counter_.fetch_add(1, std::memory_order_relaxed);
+
+  if (params_.GetMaxConcurrentSearchers() != 0) {
+    search_->pending_searchers_.fetch_add(1, std::memory_order_acq_rel);
+  }
+
+  // 4. Run NN computation (this function has no step 3).
+  RunNNComputation();
+  search_->backend_waiting_counter_.fetch_add(-1, std::memory_order_relaxed);
+
+  // 5. Retrieve NN computations (and terminal values) into nodes.
+  FetchMinibatchResults();
+
+  // 6. Propagate the new nodes' information to all their parents in the tree.
+  DoBackupUpdate();
+
+  // 7. Update the Search's status and progress information.
+  UpdateCounters();
+
+  // If an nps limit is set, sleep in 1 ms steps until nps is within it.
+  if (params_.GetNpsLimit() > 0 && iteration_stats_.time_since_first_batch) {
+    while (search_->IsSearchActive()) {
+      // GetTimeSinceFirstBatch is set only once. We check iteration_stats_ to
+      // know if it was set and later read inside nodes_mutex_.
+      int64_t time_since_first_batch_ms = search_->GetTimeSinceFirstBatch();
+      auto nps = search_->GetTotalPlayouts() * 1e3f / time_since_first_batch_ms;
+      if (nps > params_.GetNpsLimit()) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+      } else {
+        break;
+      }
+    }
+  }
+}
+
+// 1. Initialize internal structures.
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+void SearchWorker::InitializeIteration() {
+  LCTRACE_FUNCTION_SCOPE;
+  // Free the old computation before allocating a new one. This works better
+  // when the backend caches buffer allocations between computations.
+  computation_.reset();
+  computation_ = search_->backend_->CreateComputation();
+  minibatch_.clear();  // Start the iteration with an empty batch.
+  minibatch_.reserve(2 * target_minibatch_size_);  // Twice the target size.
+}
+
+// 2. Gather minibatch.
+// ~~~~~~~~~~~~~~~~~~~~
+namespace {
+// Linear blend from `low` (ratio 0) to `high` (ratio 1), rounded to nearest.
+int Mix(int high, int low, float ratio) {
+  const float span = static_cast<float>(high - low);
+  return static_cast<int>(std::round(static_cast<float>(low) + span * ratio));
+}
+
+// Maps `nodes` onto a limit ranging from 1 to GetMaxCollisionVisits().
+int CalculateCollisionsLeft(int64_t nodes, const SearchParams& params) {
+  const auto scale_start = params.GetMaxCollisionVisitsScalingStart();
+  const auto scale_end = params.GetMaxCollisionVisitsScalingEnd();
+  // The upper bound is tested before the lower one.
+  if (nodes >= scale_end) return params.GetMaxCollisionVisits();
+  if (nodes <= scale_start) return 1;
+  // In between: blend 1 and the maximum using progress raised to the power.
+  const auto progress =
+      (static_cast<float>(nodes) - scale_start) / (scale_end - scale_start);
+  const auto ratio =
+      std::pow(progress, params.GetMaxCollisionVisitsScalingPower());
+  return Mix(params.GetMaxCollisionVisits(), 1, ratio);
+}
+}  // namespace
+
+void SearchWorker::GatherMinibatch() {
+  LCTRACE_FUNCTION_SCOPE;
+  // Total number of nodes to process.
+  int minibatch_size = 0;
+  int cur_n = 0;
+
+  // Collisions use atomic operations, so we can cancel them outside the lock.
+  struct CollisionsManager {
+    SearchWorker& worker;
+    CollisionsManager(SearchWorker& worker) : worker(worker) {
+    }
+    ~CollisionsManager() {
+      worker.CancelCollisions();
+    }
+  } cancel_collisions_object(*this);  // Declared before `lock`: outlives it.
+  // We take the nodes_mutex_ only once to avoid bouncing between this thread
+  // and a thread returning from RunNNComputation.
+  SharedMutex::Lock lock(search_->nodes_mutex_);
+  cur_n = search_->root_node_->GetN();
+  // TODO: GetEstimatedRemainingPlayouts has already had smart pruning factor
+  // applied, which doesn't clearly make sense to include here...
+  int64_t remaining_n =
+      latest_time_manager_hints_.GetEstimatedRemainingPlayouts();
+  int collisions_left = CalculateCollisionsLeft(
+      std::min(static_cast<int64_t>(cur_n), remaining_n), params_);
+
+  // Number of nodes processed out of order.
+  number_out_of_order_ = 0;
+
+  int thread_count = search_->thread_count_.load(std::memory_order_acquire);
+
+  absl::Cleanup record_batch_start_time = [&] {
+    if (minibatch_size) search_->RecordNPSStartTime();
+  };  // Runs on every exit path below.
+
+  // Gather nodes to process in the current batch.
+  // If we had too many nodes out of order, also interrupt the iteration so
+  // that search can exit.
+  while (minibatch_size < target_minibatch_size_ &&
+         number_out_of_order_ < max_out_of_order_) {
+    // If there's something to process without touching slow neural net, do it.
+    if (minibatch_size > 0 && computation_->UsedBatchSize() == 0) return;
+
+    // If there is backend work to be done, and the backend is idle - exit
+    // immediately.
+    // Only do this fancy work if there are multiple threads as otherwise we
+    // early exit from every batch since there is never another search thread to
+    // be keeping the backend busy. Which would mean that threads=1 has a
+    // massive nps drop.
+    if (thread_count > 1 && minibatch_size > 0 &&
+        static_cast<int>(computation_->UsedBatchSize()) >
+            params_.GetIdlingMinimumWork() &&
+        thread_count - search_->backend_waiting_counter_.load(
+                           std::memory_order_relaxed) >
+            params_.GetThreadIdlingThreshold()) {
+      return;
+    }
+
+    int new_start = static_cast<int>(minibatch_.size());
+
+    PickNodesToExtend(
+        std::min({collisions_left, target_minibatch_size_ - minibatch_size,
+                  max_out_of_order_ - number_out_of_order_}));
+
+    // Count the non-collisions.
+    int non_collisions = 0;
+    for (int i = new_start; i < static_cast<int>(minibatch_.size()); i++) {
+      auto& picked_node = minibatch_[i];
+      if (picked_node.IsCollision()) {
+        continue;
+      }
+      ++non_collisions;
+      ++minibatch_size;
+    }
+
+    {
+      // Process the new picks, splitting them into tasks when there are enough.
+      bool needs_wait = false;
+      int ppt_start = new_start;
+      if (task_workers_ > 0 &&
+          non_collisions >= params_.GetMinimumWorkSizeForProcessing()) {
+        const int num_tasks = std::clamp(
+            non_collisions / params_.GetMinimumWorkPerTaskForProcessing(), 2,
+            task_workers_ + 1);
+        // Round down, leftovers can go to main thread so it waits less.
+        int per_worker = non_collisions / num_tasks;
+        needs_wait = true;
+        ResetTasks();
+        int found = 0;
+        for (int i = new_start; i < static_cast<int>(minibatch_.size()); i++) {
+          auto& picked_node = minibatch_[i];
+          if (picked_node.IsCollision()) {
+            continue;
+          }
+          ++found;
+          if (found == per_worker) {
+            picking_tasks_.emplace_back(ppt_start, i + 1);
+            task_count_.fetch_add(1, std::memory_order_acq_rel);
+            ppt_start = i + 1;
+            found = 0;
+            if (picking_tasks_.size() == static_cast<size_t>(num_tasks - 1)) {
+              break;  // The last share stays with this thread.
+            }
+          }
+        }
+      }
+      ProcessPickedTask(ppt_start, static_cast<int>(minibatch_.size()));
+      if (needs_wait) {
+        WaitForTasks();
+      }
+    }
+    bool some_ooo = false;
+    for (int i = static_cast<int>(minibatch_.size()) - 1; i >= new_start; i--) {
+      if (minibatch_[i].ooo_completed) {
+        some_ooo = true;
+        break;
+      }
+    }
+    if (some_ooo) {
+      for (int i = static_cast<int>(minibatch_.size()) - 1; i >= new_start;
+           i--) {
+        // If there was any OOO, revert 'all' new collisions - it isn't possible
+        // to identify exactly which ones are afterwards and only prune those.
+        // This may remove too many items, but hopefully most of the time they
+        // will just be added back the same way in the next gather.
+        if (minibatch_[i].IsCollision()) {
+          for (auto it = ++(minibatch_[i].path.crbegin());
+               it != minibatch_[i].path.crend(); ++it) {
+            std::get<0>(*it)->CancelScoreUpdate(minibatch_[i].multivisit);
+          }
+          minibatch_.erase(minibatch_.begin() + i);
+        } else if (minibatch_[i].ooo_completed) {
+          FetchSingleNodeResult(&minibatch_[i]);
+          DoBackupUpdateSingleNode(minibatch_[i]);
+          minibatch_.erase(minibatch_.begin() + i);
+          --minibatch_size;
+          ++number_out_of_order_;
+        }
+      }
+    }
+
+    // Check for stop at the end so we have at least one node.
+    for (size_t i = new_start; i < minibatch_.size(); i++) {
+      auto& picked_node = minibatch_[i];
+
+      if (picked_node.IsCollision()) {
+        // Check to see if we can upsize the collision to exit sooner.
+        if (picked_node.maxvisit > 0 &&
+            collisions_left > picked_node.multivisit) {
+          int extra = std::min(picked_node.maxvisit, collisions_left) -
+                      picked_node.multivisit;
+          picked_node.multivisit += extra;
+          for (auto it = ++(picked_node.path.crbegin());
+               it != picked_node.path.crend(); ++it) {
+            std::get<0>(*it)->IncrementNInFlight(extra);
+          }
+        }
+        if ((collisions_left -= picked_node.multivisit) <= 0) return;
+        if (search_->stop_.load(std::memory_order_acquire)) return;
+      }
+    }
+  }
+}
+
+void SearchWorker::ProcessPickedTask(int start_idx, int end_idx)
+    REQUIRES(search_->nodes_mutex_) {
+  // Handles minibatch_ entries in [start_idx, end_idx); collisions are left
+  // untouched.
+  for (int idx = start_idx; idx < end_idx; ++idx) {
+    NodeToProcess& entry = minibatch_[idx];
+    if (!entry.IsCollision()) {
+      // Terminals and nodes that already have a low node were visited before
+      // and cannot be extended; only never-visited nodes reach ExtendNode.
+      if (entry.IsExtendable()) ExtendNode(entry);
+
+      // Short-circuit: CanEvalOutOfOrder() runs only when the option is on.
+      const bool ooo_allowed = params_.GetOutOfOrderEval();
+      entry.ooo_completed = ooo_allowed && entry.CanEvalOutOfOrder();
+    }
+  }
+}
+
+#define MAX_TASKS 256  // Capacity reserved for picking_tasks_ in ResetTasks().
+
+void SearchWorker::ResetTasks() {
+  // Tasks must be completed before reset.
+  assert(IsTasksCompleted(task_count_, completed_tasks_));
+  task_count_.store(0, std::memory_order_release);
+  completed_tasks_.store(0, std::memory_order_release);
+  picking_tasks_.clear();
+  // Reserve up front; reallocation would break pointers held by task threads.
+  picking_tasks_.reserve(MAX_TASKS);
+}
+
+int SearchWorker::WaitForTasks() REQUIRES(search_->nodes_mutex_) {
+  // Process any outstanding tasks before checking if completed. This avoids a
+  // long polling loop when PickNodesToExtend scheduled many tasks.
+  while (true) {
+    PickTask* task = nullptr;
+    int id = 0;
+    std::tie(task, id, std::ignore) = PickTaskToProcess();
+    if (task == nullptr) {
+      break;
+    }
+    ProcessTask(task, id, &minibatch_, &main_workspace_);
+  }
+  // Spin until all queued tasks are reported complete; returns that count.
+  while (true) {
+    int completed = completed_tasks_.load(std::memory_order_acquire);
+    int todo, nta;
+    std::tie(std::ignore, nta, todo) = ReadTaskCount(task_count_);
+    std::ignore = nta;  // Keeps nta "used" when the assert is compiled out.
+    assert(nta <= todo);
+    if (todo == completed) return completed;
+    SpinloopPause();
+  }
+}
+
+void SearchWorker::PickNodesToExtend(int collision_limit)
+    REQUIRES(search_->nodes_mutex_) {
+  ResetTasks();
+  if (task_workers_ > 0 && !search_->backend_attributes_.runs_on_cpu) {
+    // While nothing is ready yet - wake the task runners so they are ready to
+    // receive quickly.
+    Mutex::Lock lock(picking_tasks_mutex_);
+    task_added_.notify_all();
+  }
+  // Trim history_ to the length of the played game history before the walk.
+  history_.Trim(search_->played_history_.GetLength());
+  PickNodesToExtendTask({std::make_tuple(search_->root_node_, 0, 0)},
+                        collision_limit, history_, &minibatch_,
+                        &main_workspace_);
+
+  WaitForTasks();
+  // Move every task's results to the end of minibatch_, in task order.
+  for (auto& finished_task : picking_tasks_) {
+    for (auto& picked : finished_task.results) {
+      minibatch_.emplace_back(std::move(picked));
+    }
+  }
+}
+
+// Classifies @position, reached @depth plies below root, as a draw by
+// repetition or a safe two-fold. Returns the number of safe repetitions and
+// moves_left; {0, 0} means neither applies.
+// Depth starts with 0 at root, so number of plies in PV equals depth.
+std::pair<int, int> SearchWorker::GetRepetitions(int depth,
+                                                 const Position& position) {
+  const auto rep_count = position.GetRepetitions();
+  // A count of zero reports nothing.
+  if (rep_count == 0) return {0, 0};
+  // Counts of 2 or more are passed through with moves_left 0.
+  if (rep_count >= 2) return {rep_count, 0};
+  // Otherwise report one repetition only when the two-fold option is on,
+  // depth is at least 4, and depth covers the plies since the previous
+  // repetition.
+  const auto ply_gap = position.GetPliesSincePrevRepetition();
+  const bool safe_two_fold =
+      params_.GetTwoFoldDraws() && depth >= 4 && depth >= ply_gap;
+  if (safe_two_fold) return {1, ply_gap};
+  return {0, 0};
+}
+
+// Check if PickNodesToExtendTask should stop picking at this @node.
+bool SearchWorker::ShouldStopPickingHere(Node* node, bool is_root_node,
+                                         int repetitions) {
+  // Thresholds at which Node and LowNode values count as different.
+  constexpr double wl_diff_limit = 0.01f;
+  constexpr float d_diff_limit = 0.01f;
+  constexpr float m_diff_limit = 2.0f;
+
+  // Unvisited and terminal nodes always end the descent.
+  const bool unvisited = node->GetN() == 0;
+  if (unvisited || node->IsTerminal()) return true;
+
+  // Only stop at root when there is no other option.
+  assert(!is_root_node || node == search_->root_node_);
+  if (is_root_node) return false;
+
+  // Stop at draws by repetition.
+  if (repetitions >= 2) return true;
+
+  // From here on, compare the Node with its LowNode.
+  auto* const low = node->GetLowNode().get();
+  assert(low);
+  // Only known transpositions can differ.
+  if (!low->IsTransposition()) return false;
+
+  // LowNode is terminal when Node is not.
+  if (low->IsTerminal()) return true;
+
+  // Bounds must mirror each other (swapped and negated); otherwise stop.
+  auto [low_lo, low_hi] = low->GetBounds();
+  auto [own_lo, own_hi] = node->GetBounds();
+  const bool bounds_mirror = low_lo == -own_hi && low_hi == -own_lo;
+  if (!bounds_mirror) return true;
+
+  // WL is compared after a sign flip, hence the sum.
+  const auto wl_gap = std::abs(low->GetWL() + node->GetWL());
+  if (wl_gap >= wl_diff_limit) return true;
+
+  // D is compared directly.
+  const auto d_gap = std::abs(low->GetD() - node->GetD());
+  if (d_gap >= d_diff_limit) return true;
+
+  // M is compared after adding one to the LowNode's value.
+  const auto m_gap = std::abs(low->GetM() + 1 - node->GetM());
+  return m_gap >= m_diff_limit;
+}
+
+void SearchWorker::PickNodesToExtendTask(
+    const BackupPath& path, int collision_limit, PositionHistory& history,
+    std::vector<NodeToProcess>* receiver,
+    TaskWorkspace* workspace) NO_THREAD_SAFETY_ANALYSIS {
+  LCTRACE_FUNCTION_SCOPE;
+  assert(path.size() == (size_t)history.GetLength() -
+                            search_->played_history_.GetLength() + 1);
+
+  // TODO: Bring back pre-cached nodes created outside locks in a way that works
+  // with tasks.
+  // TODO: pre-reserve visits_to_perform for expected depth and likely maximum
+  // width. Maybe even do so outside of lock scope.
+  auto& vtp_buffer = workspace->vtp_buffer;
+  auto& visits_to_perform = workspace->visits_to_perform;
+  visits_to_perform.clear();
+  auto& vtp_last_filled = workspace->vtp_last_filled;
+  vtp_last_filled.clear();
+  auto& current_path = workspace->current_path;
+  current_path.clear();
+  auto& full_path = workspace->full_path;
+  full_path = path;
+  assert(full_path.size() > 0);
+  auto [node, repetitions, moves_left] = full_path.back();
+  // Sometimes receiver is reused, other times not, so only jump start if small.
+  if (receiver->capacity() < 30) {
+    receiver->reserve(receiver->size() + 30);
+  }
+
+  // This 1 is 'filled pre-emptively'.
+  std::array<float, 256> current_util;
+
+  // These 3 are 'filled on demand'.
+  std::array<float, 256> current_score;
+  std::array<int, 256> current_nstarted;
+  auto& cur_iters = workspace->cur_iters;
+
+  Node::Iterator best_edge;
+  Node::Iterator second_best_edge;
+  // Fetch the current best root node visits for possible smart pruning.
+  const int64_t best_node_n = search_->current_best_edge_.GetN();
+
+  int passed_off = 0;  // Visits handed over to other tasks.
+  int completed_visits = 0;  // Visits and collisions already emitted here.
+
+  bool is_root_node = node == search_->root_node_;
+  const float even_draw_score = search_->GetDrawScore(false);
+  const float odd_draw_score = search_->GetDrawScore(true);
+  const auto& root_move_filter = search_->root_move_filter_;
+  auto m_evaluator = moves_left_support_ ? MEvaluator(params_) : MEvaluator();
+
+  int max_limit = std::numeric_limits<int>::max();
+
+  current_path.push_back(-1);  // -1 marks a level that is not prepared yet.
+  while (current_path.size() > 0) {
+    assert(full_path.size() >= path.size());
+    // First prepare visits_to_perform.
+    if (current_path.back() == -1) {
+      // Need to do n visits, where n is either collision_limit, or comes from
+      // visits_to_perform for the current path.
+      int cur_limit = collision_limit;
+      if (current_path.size() > 1) {
+        cur_limit =
+            (*visits_to_perform.back())[current_path[current_path.size() - 2]];
+      }
+      // First check if node is terminal or not-expanded. If so, create a
+      // collision of appropriate size and pop current_path.
+      if (ShouldStopPickingHere(node, is_root_node, repetitions)) {
+        if (is_root_node) {
+          // Root node is special - it is not reached from anywhere else, so
+          // it needs its own logic. Still need to create the collision to
+          // ensure the outer gather loop gives up.
+          if (node->TryStartScoreUpdate()) {
+            cur_limit -= 1;
+            minibatch_.push_back(
+                NodeToProcess::Visit(full_path, search_->played_history_));
+            completed_visits++;
+          }
+        }
+        // Visits are created elsewhere, just need the collisions here.
+        if (cur_limit > 0) {
+          int max_count = 0;
+          if (cur_limit == collision_limit && path.size() == 1 &&
+              max_limit > cur_limit) {
+            max_count = max_limit;
+          }
+          receiver->push_back(
+              NodeToProcess::Collision(full_path, cur_limit, max_count));
+          completed_visits += cur_limit;
+        }
+        history.Pop();
+        full_path.pop_back();
+        if (full_path.size() > 0) {
+          std::tie(node, repetitions, moves_left) = full_path.back();
+        } else {
+          node = nullptr;
+          repetitions = 0;
+        }
+        current_path.pop_back();
+        continue;
+      }
+      if (is_root_node) {
+        // Root node is again special - needs its n in flight updated separately
+        // as it's not handled on the path to it, since there isn't one.
+        node->IncrementNInFlight(cur_limit);
+      }
+
+      // Create visits_to_perform new back entry for this level.
+      if (vtp_buffer.size() > 0) {
+        visits_to_perform.push_back(std::move(vtp_buffer.back()));
+        vtp_buffer.pop_back();
+      } else {
+        visits_to_perform.push_back(std::make_unique<std::array<int, 256>>());
+      }
+      vtp_last_filled.push_back(-1);
+
+      // Cache all constant UCT parameters.
+
+      int max_needed = node->GetNumEdges();
+      for (int i = 0; i < max_needed; i++) {
+        current_util[i] = std::numeric_limits<float>::lowest();
+      }
+      // Root depth is 1 here, while for GetDrawScore() it's 0-based, that's why
+      // the weirdness.
+      const float draw_score =
+          (full_path.size() % 2 == 0) ? odd_draw_score : even_draw_score;
+      m_evaluator.SetParent(node);
+      float visited_pol = 0.0f;
+      for (Node* child : node->VisitedNodes()) {
+        int index = child->Index();
+        visited_pol += child->GetP();
+        float q = child->GetQ(draw_score);
+        current_util[index] = q + m_evaluator.GetMUtility(child, q);
+      }
+      const float fpu =
+          GetFpu(params_, node, is_root_node, draw_score, visited_pol);
+      for (int i = 0; i < max_needed; i++) {
+        if (current_util[i] == std::numeric_limits<float>::lowest()) {
+          current_util[i] = fpu + m_evaluator.GetDefaultMUtility();
+        }
+      }
+
+      const float cpuct =
+          ComputeCpuct(params_, node->GetTotalVisits(), is_root_node);
+      const float puct_mult =
+          cpuct * std::sqrt(std::max(node->GetChildrenVisits(), 1u));
+      int cache_filled_idx = -1;
+      while (cur_limit > 0) {
+        // Perform UCT for current node.
+        float best = std::numeric_limits<float>::lowest();
+        int best_idx = -1;
+        float best_without_u = std::numeric_limits<float>::lowest();
+        float second_best = std::numeric_limits<float>::lowest();
+        bool can_exit = false;
+        best_edge.Reset();
+        for (int idx = 0; idx < max_needed; ++idx) {
+          if (idx > cache_filled_idx) {
+            if (idx == 0) {
+              cur_iters[idx] = node->Edges();
+            } else {
+              cur_iters[idx] = cur_iters[idx - 1];
+              ++cur_iters[idx];
+            }
+            current_nstarted[idx] = cur_iters[idx].GetNStarted();
+          }
+          int nstarted = current_nstarted[idx];
+          const float util = current_util[idx];
+          if (idx > cache_filled_idx) {
+            current_score[idx] =
+                cur_iters[idx].GetP() * puct_mult / (1 + nstarted) + util;
+            cache_filled_idx++;
+          }
+          if (is_root_node) {
+            // If there's no chance to catch up to the current best node with
+            // remaining playouts, don't consider it.
+            // best_move_node_ could have changed since best_node_n was
+            // retrieved. To ensure we have at least one node to expand, always
+            // include current best node.
+            if (cur_iters[idx] != search_->current_best_edge_ &&
+                latest_time_manager_hints_.GetEstimatedRemainingPlayouts() <
+                    best_node_n - cur_iters[idx].GetN()) {
+              continue;
+            }
+            // If root move filter exists, make sure move is in the list.
+            if (!root_move_filter.empty() &&
+                std::find(root_move_filter.begin(), root_move_filter.end(),
+                          cur_iters[idx].GetMove()) == root_move_filter.end()) {
+              continue;
+            }
+          }
+
+          float score = current_score[idx];
+          if (score > best) {
+            second_best = best;
+            second_best_edge = best_edge;
+            best = score;
+            best_idx = idx;
+            best_without_u = util;
+            best_edge = cur_iters[idx];
+          } else if (score > second_best) {
+            second_best = score;
+            second_best_edge = cur_iters[idx];
+          }
+          if (can_exit) break;
+          if (nstarted == 0) {
+            // One more loop will get 2 unvisited nodes, which is sufficient to
+            // ensure second best is correct. This relies upon the fact that
+            // edges are sorted in policy decreasing order.
+            can_exit = true;
+          }
+        }
+        int new_visits = 0;
+        if (second_best_edge) {
+          int estimated_visits_to_change_best = std::numeric_limits<int>::max();
+          if (best_without_u < second_best) {
+            const auto n1 = current_nstarted[best_idx] + 1;
+            estimated_visits_to_change_best = static_cast<int>(
+                std::max(1.0f, std::min(cur_iters[best_idx].GetP() * puct_mult /
+                                                (second_best - best_without_u) -
+                                            n1 + 1,
+                                        1e9f)));
+          }
+          second_best_edge.Reset();
+          max_limit = std::min(max_limit, estimated_visits_to_change_best);
+          new_visits = std::min(cur_limit, estimated_visits_to_change_best);
+        } else {
+          // No second best - only one edge, so everything goes in here.
+          new_visits = cur_limit;
+        }
+        if (best_idx >= vtp_last_filled.back()) {
+          auto* vtp_array = visits_to_perform.back().get()->data();
+          std::fill(vtp_array + (vtp_last_filled.back() + 1),
+                    vtp_array + best_idx + 1, 0);
+        }
+        (*visits_to_perform.back())[best_idx] += new_visits;
+        cur_limit -= new_visits;
+
+        Node* child_node = best_edge.GetOrSpawnNode(/* parent */ node);
+        history.Append(best_edge.GetMove());
+        auto [child_repetitions, child_moves_left] =
+            GetRepetitions(full_path.size(), history.Last());
+        full_path.push_back({child_node, child_repetitions, child_moves_left});
+        if (child_node->TryStartScoreUpdate()) {
+          current_nstarted[best_idx]++;
+          new_visits -= 1;
+          if (ShouldStopPickingHere(child_node, false, child_repetitions)) {
+            // Reduce 1 for the visits_to_perform to ensure the collision
+            // created doesn't include this visit.
+            (*visits_to_perform.back())[best_idx] -= 1;
+            receiver->push_back(NodeToProcess::Visit(full_path, history));
+            completed_visits++;
+          } else {
+            child_node->IncrementNInFlight(new_visits);
+            current_nstarted[best_idx] += new_visits;
+          }
+          current_score[best_idx] = cur_iters[best_idx].GetP() * puct_mult /
+                                        (1 + current_nstarted[best_idx]) +
+                                    current_util[best_idx];
+        }
+        if (best_idx > vtp_last_filled.back() &&
+            (*visits_to_perform.back())[best_idx] > 0) {
+          vtp_last_filled.back() = best_idx;
+        }
+        history.Pop();
+        full_path.pop_back();
+      }
+      is_root_node = false;
+      // Actively do any splits now rather than waiting for potentially long
+      // tree walk to get there.
+      for (int i = 0; i <= vtp_last_filled.back(); i++) {
+        int child_limit = (*visits_to_perform.back())[i];
+        if (task_workers_ > 0 &&
+            child_limit > params_.GetMinimumWorkSizeForPicking() &&
+            child_limit <
+                ((collision_limit - passed_off - completed_visits) * 2 / 3) &&
+            child_limit + passed_off + completed_visits <
+                collision_limit -
+                    params_.GetMinimumRemainingWorkSizeForPicking()) {
+          Node* child_node = cur_iters[i].GetOrSpawnNode(/* parent */ node);
+          history.Append(cur_iters[i].GetMove());
+          auto [child_repetitions, child_moves_left] =
+              GetRepetitions(full_path.size(), history.Last());
+          full_path.push_back(
+              {child_node, child_repetitions, child_moves_left});
+          // Don't split if not expanded or terminal.
+          if (!ShouldStopPickingHere(child_node, false, child_repetitions)) {
+            bool passed = false;
+            {
+              // Multiple writers, so need mutex here.
+              Mutex::Lock lock(picking_tasks_mutex_);
+              // Ensure not to exceed size of reservation.
+              if (picking_tasks_.size() < MAX_TASKS) {
+                picking_tasks_.emplace_back(full_path, history, child_limit);
+                task_count_.fetch_add(1, std::memory_order_acq_rel);
+                task_added_.notify_all();
+                passed = true;
+                passed_off += child_limit;
+              }
+            }
+            if (passed) {
+              (*visits_to_perform.back())[i] = 0;
+            }
+          }
+          history.Pop();
+          full_path.pop_back();
+        }
+      }
+      // Fall through to select the first child.
+    }
+    int min_idx = current_path.back();
+    bool found_child = false;
+    if (vtp_last_filled.back() > min_idx) {
+      int idx = -1;
+      for (auto& child : node->Edges()) {
+        idx++;
+        if (idx > min_idx && (*visits_to_perform.back())[idx] > 0) {
+          current_path.back() = idx;
+          current_path.push_back(-1);
+          node = child.GetOrSpawnNode(/* parent */ node);
+          history.Append(child.GetMove());
+          std::tie(repetitions, moves_left) =
+              GetRepetitions(full_path.size(), history.Last());
+          full_path.push_back({node, repetitions, moves_left});
+          found_child = true;
+          break;
+        }
+        if (idx >= vtp_last_filled.back()) break;
+      }
+    }
+    if (!found_child) {  // Level exhausted: step back up to the parent.
+      history.Pop();
+      full_path.pop_back();
+      if (full_path.size() > 0) {
+        std::tie(node, repetitions, moves_left) = full_path.back();
+      } else {
+        node = nullptr;
+        repetitions = 0;
+      }
+      current_path.pop_back();
+      vtp_buffer.push_back(std::move(visits_to_perform.back()));
+      visits_to_perform.pop_back();
+      vtp_last_filled.pop_back();
+    }
+  }
+}
+
+void SearchWorker::ExtendNode(NodeToProcess& picked_node) {
+  // Makes the picked node terminal, a TT hit, or a pending NN evaluation.
+  const auto& path = picked_node.path;  // Borrowed; the path is only read.
+  assert(!std::get<0>(path.back())->GetLowNode());
+  const PositionHistory& history = picked_node.history;
+
+  // We don't need the mutex because other threads will see that N=0 and
+  // N-in-flight=1 and will not touch this node.
+  const auto& board = history.Last().GetBoard();
+  std::vector<Move> legal_moves = board.GenerateLegalMoves();
+
+  // Check whether it's a draw/lose by position. Importantly, we must check
+  // these before doing the by-rule checks below.
+  auto node = picked_node.node;
+  if (legal_moves.empty()) {
+    // Could be a checkmate or a stalemate
+    if (board.IsUnderCheck()) {
+      node->MakeTerminal(GameResult::WHITE_WON);
+    } else {
+      node->MakeTerminal(GameResult::DRAW);
+    }
+    return;
+  }
+
+  // We can shortcircuit these draws-by-rule only if they aren't root;
+  // if they are root, then thinking about them is the point.
+  if (node != search_->root_node_) {
+    if (!board.HasMatingMaterial()) {
+      node->MakeTerminal(GameResult::DRAW);
+      return;
+    }
+
+    if (history.Last().GetRule50Ply() >= 100) {
+      node->MakeTerminal(GameResult::DRAW);
+      return;
+    }
+
+    // Handle repetition draws as pseudo-terminals.
+    if (picked_node.repetitions >= 2) {
+      // Not a real terminal, set low node.
+    }
+    // Neither by-position or by-rule termination, but maybe it's a TB
+    // position.
+    else if (search_->syzygy_tb_ && !search_->root_is_in_dtz_ &&
+             board.castlings().no_legal_castle() &&
+             history.Last().GetRule50Ply() == 0 &&
+             (board.ours() | board.theirs()).count() <=
+                 search_->syzygy_tb_->max_cardinality()) {
+      ProbeState state;
+      const WDLScore wdl =
+          search_->syzygy_tb_->probe_wdl(history.Last(), &state);
+      // Only the FAIL state means the WDL is wrong; probe_wdl may produce a
+      // correct result with a state other than OK.
+      if (state != FAIL) {
+        // TB nodes don't have NN evaluation, assign M from parent node.
+        float m = 0.0f;
+        if (path.size() > 1) {
+          auto parent = std::get<0>(path[path.size() - 2]);
+          m = std::max(0.0f, parent->GetM() - 1.0f);
+        }
+        // If the colors seem backwards, check the checkmate check above.
+        if (wdl == WDL_WIN) {
+          node->MakeTerminal(GameResult::BLACK_WON, m, Terminal::Tablebase);
+        } else if (wdl == WDL_LOSS) {
+          node->MakeTerminal(GameResult::WHITE_WON, m, Terminal::Tablebase);
+        } else {  // Cursed wins and blessed losses count as draws.
+          node->MakeTerminal(GameResult::DRAW, m, Terminal::Tablebase);
+        }
+        search_->tb_hits_.fetch_add(1, std::memory_order_acq_rel);
+        return;
+      }
+    }
+  }
+
+  // Check the transposition table first and NN cache second before asking for
+  // NN evaluation.
+  picked_node.hash = history.HashLast(params_.GetCacheHistoryLength() + 1);
+  auto tt_iter = search_->tt_->find(picked_node.hash);
+  // Transposition table entry might be expired.
+  if (tt_iter != search_->tt_->end()) {
+    picked_node.tt_low_node = tt_iter->second.lock();
+  }
+  if (picked_node.tt_low_node) {
+    assert(!tt_iter->second.expired());
+    picked_node.is_tt_hit = true;
+  } else {
+    picked_node.tt_low_node = std::make_shared<LowNode>(legal_moves);
+    picked_node.nn_queried = true;
+    picked_node.eval->p.resize(legal_moves.size());
+    picked_node.is_cache_hit = computation_->AddInput(
+                                   EvalPosition{
+                                       .pos = history.GetPositions(),
+                                       .legal_moves = legal_moves,
+                                   },
+                                   picked_node.eval->AsPtr()) ==
+                               BackendComputation::FETCHED_IMMEDIATELY;
+  }
+}
+
+// 4. Run NN computation (skipped when the batch is empty).
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+void SearchWorker::RunNNComputation() {
+  if (computation_->UsedBatchSize() > 0) computation_->ComputeBlocking();
+}
+
+// 5. Retrieve NN computations (and terminal values) into nodes.
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+void SearchWorker::FetchMinibatchResults() {
+  LCTRACE_FUNCTION_SCOPE;
+  // Process every minibatch entry; entries without an NN query are no-ops.
+  for (auto& node_to_process : minibatch_) {
+    FetchSingleNodeResult(&node_to_process);
+  }
+}
+
+void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process) {
+  if (!node_to_process->nn_queried) return;  // No NN query to consume.
+
+  auto wdl_rescale = [&]() {
+    if (params_.GetWDLRescaleRatio() != 1.0f ||
+        (params_.GetWDLRescaleDiff() != 0.0f &&
+         search_->contempt_mode_ != ContemptMode::NONE)) {
+      // Check whether root moves are from the set perspective.
+      bool root_stm = search_->contempt_mode_ == ContemptMode::WHITE;
+      auto sign = (root_stm ^ node_to_process->history.IsBlackToMove())
+                      ? 1.0f
+                      : -1.0f;
+      WDLRescale(node_to_process->eval->q, node_to_process->eval->d,
+                 params_.GetWDLRescaleRatio(),
+                 search_->contempt_mode_ == ContemptMode::NONE
+                     ? 0
+                     : params_.GetWDLRescaleDiff(),
+                 sign, false, params_.GetWDLMaxS());
+    }
+  };
+  wdl_rescale();
+  node_to_process->tt_low_node->SetNNEval(node_to_process->eval.get());
+  node_to_process->tt_low_node->SortEdges();
+
+  // The node itself is only needed for the root check below.
+  Node* node = node_to_process->node;
+  // Add Dirichlet noise if enabled and at root.
+  if (params_.GetNoiseEpsilon() && node == search_->root_node_) {
+    ApplyDirichletNoise(node_to_process->tt_low_node.get(),
+                        params_.GetNoiseEpsilon(), params_.GetNoiseAlpha());
+    node_to_process->tt_low_node->SortEdges();  // Sort again after the noise.
+  }
+}
+
+// 6. Propagate the new nodes' information to all their parents in the tree.
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+void SearchWorker::DoBackupUpdate() {
+  LCTRACE_FUNCTION_SCOPE;
+  // Nodes mutex for doing node updates.
+  SharedMutex::Lock lock(search_->nodes_mutex_);
+
+  bool work_done = number_out_of_order_ > 0;
+  for (const NodeToProcess& node_to_process : minibatch_) {
+    DoBackupUpdateSingleNode(node_to_process);
+    if (!node_to_process.IsCollision()) {
+      work_done = true;
+    }
+  }
+  if (!work_done) return;  // Count only batches that did some work.
+  search_->total_batches_ += 1;
+}
+
+bool SearchWorker::MaybeAdjustForTerminalOrTransposition(
+    Node* n, const std::shared_ptr<LowNode>& nl, float& v, float& d, float& m,
+    uint32_t& n_to_fix, float& v_delta, float& d_delta, float& m_delta,
+    bool& update_parent_bounds) const {
+  if (n->IsTerminal()) {  // Terminal nodes report their own stored values.
+    v = n->GetWL();
+    d = n->GetD();
+    m = n->GetM();
+
+    return true;
+  }
+
+  // Use low node info: transposition, new terminal, or more visits than n.
+  if (nl->IsTransposition() || nl->IsTerminal() || n->GetN() < nl->GetN()) {
+    // Adapt information from low node to node by flipping Q sign, bounds,
+    // result and incrementing m.
+    v = -nl->GetWL();
+    d = nl->GetD();
+    m = nl->GetM() + 1;
+    // When starting at or going through a transposition/terminal, make sure to
+    // use the information it has already acquired.
+    n_to_fix = n->GetN();
+    v_delta = v - n->GetWL();
+    d_delta = d - n->GetD();
+    m_delta = m - n->GetM();
+    // Update bounds.
+    if (params_.GetStickyEndgames()) {
+      auto tt = nl->GetTerminalType();
+      if (tt != Terminal::NonTerminal) {  // Low node is terminal: copy it.
+        GameResult r;
+        if (v == 1.0f) {
+          r = GameResult::WHITE_WON;
+        } else if (v == -1.0f) {
+          r = GameResult::BLACK_WON;
+        } else {
+          r = GameResult::DRAW;
+        }
+
+        n->MakeTerminal(r, m, tt);
+        update_parent_bounds = true;
+      } else {
+        auto [lower, upper] = nl->GetBounds();
+        n->SetBounds(-upper, -lower);  // Negated and swapped for the node.
+      }
+    }
+
+    return true;
+  }
+
+  return false;  // Nothing was adjusted.
+}
+
+// Use information from terminal status or low node to update node and node's
+// parent low node and so on until the root is reached. Low node may become a
+// transposition and/or get more information even during this batch. Both low
+// node and node may adjust bounds and become a terminal during this batch.
+void SearchWorker::DoBackupUpdateSingleNode(
+    const NodeToProcess& node_to_process) REQUIRES(search_->nodes_mutex_) {
+  if (node_to_process.IsCollision()) {
+    return;
+  }
+
+  const auto& path = node_to_process.path;  // Read-only view, no copy.
+
+  if (node_to_process.nn_queried) {
+    auto [tt_iter, is_tt_miss] = search_->tt_->try_emplace(
+        node_to_process.hash, node_to_process.tt_low_node);
+    if (is_tt_miss) {
+      assert(!tt_iter->second.expired());
+      node_to_process.node->SetLowNode(node_to_process.tt_low_node);
+    } else {
+      auto tt_low_node = tt_iter->second.lock();
+      if (!tt_low_node) {
+        tt_iter->second = node_to_process.tt_low_node;
+        node_to_process.node->SetLowNode(node_to_process.tt_low_node);
+      } else {
+        assert(!tt_iter->second.expired());
+        node_to_process.node->SetLowNode(tt_low_node);
+      }
+    }
+  } else if (node_to_process.is_tt_hit) {
+    node_to_process.node->SetLowNode(node_to_process.tt_low_node);
+  }
+
+  auto [n, nr, nm] = path.back();
+  // For the first visit to a terminal, maybe update parent bounds too.
+  auto update_parent_bounds =
+      params_.GetStickyEndgames() && n->IsTerminal() && !n->GetN();
+  const auto& nl = n->GetLowNode();
+  float v = 0.0f;
+  float d = 0.0f;
+  float m = 0.0f;
+  uint32_t n_to_fix = 0;
+  float v_delta = 0.0f;
+  float d_delta = 0.0f;
+  float m_delta = 0.0f;
+
+  // Update the low node at the start of the backup path first, but only visit
+  // it the first time that backup sees it.
+  if (nl && nl->GetN() == 0) {
+    nl->FinalizeScoreUpdate(nl->GetWL(), nl->GetD(), nl->GetM(),
+                            node_to_process.multivisit);
+  }
+
+  if (nr >= 2) {
+    // Three-fold itself has to be handled as a terminal to produce relevant
+    // results. Unlike two-folds that can keep updating their "real" values.
+    n->SetRepetition();
+    v = 0.0f;
+    d = 1.0f;
+    m = 1;
+  } else if (!MaybeAdjustForTerminalOrTransposition(n, nl, v, d, m, n_to_fix,
+                                                    v_delta, d_delta, m_delta,
+                                                    update_parent_bounds)) {
+    // If there is nothing better, use original NN values adjusted for node.
+    v = -nl->GetWL();
+    d = nl->GetD();
+    m = nl->GetM() + 1;
+  }
+
+  // Backup V value up to a root. After 1 visit, V = Q.
+  for (auto it = path.crbegin(); it != path.crend();
+       /* ++it in the body */) {
+    n->FinalizeScoreUpdate(v, d, m, node_to_process.multivisit);
+    if (n_to_fix > 0 && !n->IsTerminal()) {
+      n->AdjustForTerminal(v_delta, d_delta, m_delta, n_to_fix);
+    }
+
+    // Stop delta update on repetition "terminal" and propagate a draw above
+    // repetitions valid on the current path.
+    // Only do this after edge update to have good values if play goes here.
+    if (nr == 1 && !n->IsTerminal()) {
+      n->SetRepetition();
+      v = 0.0f;
+      d = 1.0f;
+      m = nm + 1;
+    }
+    if (n->IsRepetition()) n_to_fix = 0;
+
+    // Nothing left to do without ancestors to update.
+    if (++it == path.crend()) break;
+    auto [p, pr, pm] = *it;
+    const auto& pl = p->GetLowNode();
+
+    assert(!p->IsTerminal() ||
+           (p->IsTerminal() && pl->IsTerminal() && p->GetWL() == -pl->GetWL() &&
+            p->GetD() == pl->GetD()));
+    // If parent low node is already a (new) terminal, then change propagated
+    // values and stop terminal adjustment.
+    if (pl->IsTerminal()) {
+      v = pl->GetWL();
+      d = pl->GetD();
+      m = pl->GetM();
+      n_to_fix = 0;
+    }
+    pl->FinalizeScoreUpdate(v, d, m, node_to_process.multivisit);
+    if (n_to_fix > 0) {
+      pl->AdjustForTerminal(v_delta, d_delta, m_delta, n_to_fix);
+    }
+
+    bool old_update_parent_bounds = update_parent_bounds;
+    // Try setting parent bounds except the root or those already terminal.
+    update_parent_bounds =
+        update_parent_bounds && p != search_->root_node_ && !pl->IsTerminal() &&
+        MaybeSetBounds(p, m, &n_to_fix, &v_delta, &d_delta, &m_delta);
+
+    // Q will be flipped for opponent.
+    v = -v;
+    v_delta = -v_delta;
+    m++;
+
+    MaybeAdjustForTerminalOrTransposition(p, pl, v, d, m, n_to_fix, v_delta,
+                                          d_delta, m_delta,
+                                          update_parent_bounds);
+
+    // Update the stats.
+    // Best move.
+    // If update_parent_bounds was set, we just adjusted bounds on the
+    // previous loop or there was no previous loop, so if n is a terminal, it
+    // just became that way and could be a candidate for changing the current
+    // best edge. Otherwise a visit can only change the best edge if it's to an
+    // edge that isn't already the best and whose visit count is greater than
+    // or equal to the best edge's.
+    if (p == search_->root_node_ &&
+        ((old_update_parent_bounds && n->IsTerminal()) ||
+         (n != search_->current_best_edge_.node() &&
+          search_->current_best_edge_.GetN() <= n->GetN()))) {
+      search_->current_best_edge_ =
+          search_->GetBestChildNoTemperature(search_->root_node_, 0);
+    }
+
+    n = p;
+    nr = pr;
+    nm = pm;
+  }
+  search_->total_playouts_ += node_to_process.multivisit;
+  if (node_to_process.nn_queried && !node_to_process.is_cache_hit) {
+    search_->network_evaluations_++;
+  }
+  search_->cum_depth_ +=
+      node_to_process.path.size() * node_to_process.multivisit;
+  search_->max_depth_ =
+      std::max(search_->max_depth_, (uint16_t)node_to_process.path.size());
+}
+
+bool SearchWorker::MaybeSetBounds(Node* p, float m, uint32_t* n_to_fix,
+                                  float* v_delta, float* d_delta,
+                                  float* m_delta) const {
+  auto losing_m = 0.0f;    // Largest M among losing edges seen so far.
+  auto prefer_tb = false;  // Whether a tablebase-terminal edge was seen.
+
+  // Determine the maximum (lower, upper) bounds across all edges.
+  // (-1,-1) Loss (initial and lowest bounds)
+  // (-1, 0) Can't Win
+  // (-1, 1) Regular node
+  // ( 0, 0) Draw
+  // ( 0, 1) Can't Lose
+  // ( 1, 1) Win (highest bounds)
+  auto lower = GameResult::BLACK_WON;
+  auto upper = GameResult::BLACK_WON;
+  for (const auto& edge : p->Edges()) {
+    const auto [edge_lower, edge_upper] = edge.GetBounds();
+    lower = std::max(edge_lower, lower);
+    upper = std::max(edge_upper, upper);
+
+    // Checkmate is the best, so short-circuit.
+    const auto is_tb = edge.IsTbTerminal();
+    if (edge_lower == GameResult::WHITE_WON && !is_tb) {
+      prefer_tb = false;
+      break;
+    } else if (edge_upper == GameResult::BLACK_WON) {
+      // Track the longest loss.
+      losing_m = std::max(losing_m, edge.GetM(0.0f));
+    }
+    prefer_tb = prefer_tb || is_tb;
+  }
+
+  // The parent's bounds are flipped from the children (-max(U), -max(L))
+  // aggregated as if it was a single child (forced move) of the same bound.
+  //       Loss (-1,-1) -> ( 1, 1) Win
+  //  Can't Win (-1, 0) -> ( 0, 1) Can't Lose
+  //    Regular (-1, 1) -> (-1, 1) Regular
+  //       Draw ( 0, 0) -> ( 0, 0) Draw
+  // Can't Lose ( 0, 1) -> (-1, 0) Can't Win
+  //        Win ( 1, 1) -> (-1,-1) Loss
+
+  // Nothing left to do for ancestors if the parent would be a regular node.
+  const auto& pl = p->GetLowNode();
+  if (lower == GameResult::BLACK_WON && upper == GameResult::WHITE_WON) {
+    return false;  // Bounds stay at the regular (-1, 1); nothing was set.
+  } else if (lower == upper) {
+    // Search can stop at the parent if the bounds can't change anymore, so make
+    // it terminal preferring shorter wins and longer losses.
+    *n_to_fix = p->GetN();
+    assert(*n_to_fix > 0);
+    pl->MakeTerminal(
+        upper, (upper == GameResult::BLACK_WON ? std::max(losing_m, m) : m),
+        prefer_tb ? Terminal::Tablebase : Terminal::EndOfGame);
+    // v, d and m will be set in MaybeAdjustForTerminalOrTransposition.
+    *v_delta = pl->GetWL() + p->GetWL();
+    *d_delta = pl->GetD() - p->GetD();
+    *m_delta = pl->GetM() + 1 - p->GetM();
+    p->MakeTerminal(
+        -upper,
+        (upper == GameResult::BLACK_WON ? std::max(losing_m, m) : m) + 1.0f,
+        prefer_tb ? Terminal::Tablebase : Terminal::EndOfGame);
+  } else {
+    pl->SetBounds(lower, upper);
+    p->SetBounds(-upper, -lower);  // Negated and swapped for the node side.
+  }
+
+  // Bounds were set, so indicate we should check the parent too.
+  return true;
+}
+
+// 7. Update the Search's status and progress information.
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+void SearchWorker::UpdateCounters() {
+  LCTRACE_FUNCTION_SCOPE;
+  search_->PopulateCommonIterationStats(&iteration_stats_);
+  search_->MaybeTriggerStop(iteration_stats_, &latest_time_manager_hints_);
+  search_->MaybeOutputInfo(iteration_stats_);
+
+  // If this thread had no work, not even out of order, then sleep for 10
+  // milliseconds. Collisions don't count as work, so have to enumerate to find
+  // out if there was anything done.
+  bool work_done = number_out_of_order_ > 0;
+  if (!work_done) {
+    for (NodeToProcess& node_to_process : minibatch_) {
+      if (!node_to_process.IsCollision()) {
+        work_done = true;
+        break;
+      }
+    }
+  }
+  if (!work_done) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+}
+
+}  // namespace dag_classic
+}  // namespace lczero
diff --git a/src/search/dag_classic/search.h b/src/search/dag_classic/search.h
new file mode 100644
index 0000000000..a60fb7e705
--- /dev/null
+++ b/src/search/dag_classic/search.h
@@ -0,0 +1,515 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2023 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <array>
+#include <condition_variable>
+#include <functional>
+#include <optional>
+#include <shared_mutex>
+#include <thread>
+#include <tuple>
+#include <vector>
+
+#include "chess/callbacks.h"
+#include "chess/uciloop.h"
+#include "neural/backend.h"
+#include "search/classic/stoppers/timemgr.h"
+#include "search/dag_classic/node.h"
+#include "search/dag_classic/params.h"
+#include "syzygy/syzygy.h"
+#include "utils/logging.h"
+#include "utils/mutex.h"
+
+namespace lczero {
+namespace dag_classic {
+
+// The tuple elements are (node, repetitions, moves left).
+typedef std::vector<std::tuple<Node*, int, int>> BackupPath;
+
+class Search {
+ public:
+  Search(const NodeTree& tree, Backend* backend,
+         std::unique_ptr<UciResponder> uci_responder,
+         const MoveList& searchmoves,
+         std::chrono::steady_clock::time_point start_time,
+         std::unique_ptr<classic::SearchStopper> stopper, bool infinite,
+         bool ponder, const OptionsDict& options, TranspositionTable* tt,
+         SyzygyTablebase* syzygy_tb);
+
+  ~Search();
+
+  // Starts worker threads and returns immediately.
+  void StartThreads(size_t how_many);
+
+  // Starts search with @threads threads and waits until it finishes.
+  void RunBlocking(size_t threads);
+
+  // Stops search. At the end bestmove will be returned. The function is not
+  // blocking, so it returns before search is actually done.
+  void Stop();
+  // Stops search, but does not return bestmove. The function is not blocking.
+  void Abort();
+  // Blocks until all worker threads finish.
+  void Wait();
+  // Returns whether search is active. Workers check that to see whether another
+  // search iteration is needed.
+  bool IsSearchActive() const;
+
+  // Returns best move, from the point of view of white player. And also ponder.
+  // May or may not use temperature, according to the settings.
+  std::pair<Move, Move> GetBestMove();
+
+  // Returns the evaluation of the best move, WITHOUT temperature. This differs
+  // from the above function; with temperature enabled, these two functions may
+  // return results from different possible moves. If @move and @is_terminal are
+  // not nullptr they are set to the best move and whether it leads to a
+  // terminal node respectively.
+  Eval GetBestEval(Move* move = nullptr, bool* is_terminal = nullptr) const;
+  // Returns the total number of playouts in the search.
+  std::int64_t GetTotalPlayouts() const;
+  // Returns the search parameters.
+  const SearchParams& GetParams() const { return params_; }
+
+  // If called after GetBestMove, another call to GetBestMove will have results
+  // from temperature having been applied again.
+  void ResetBestMove();
+
+  void RecordNPSStartTime();
+
+ private:
+  // Computes the best move, maybe with temperature (according to the settings).
+  void EnsureBestMoveKnown();
+
+  // Returns a child with most visits, with or without temperature.
+  // NoTemperature is safe to use on non-extended nodes, while WithTemperature
+  // accepts only nodes with at least 1 visited child.
+  EdgeAndNode GetBestChildNoTemperature(Node* parent, int depth) const;
+  std::vector<EdgeAndNode> GetBestChildrenNoTemperature(Node* parent, int count,
+                                                        int depth) const;
+  EdgeAndNode GetBestRootChildWithTemperature(float temperature) const;
+
+  int64_t GetTimeSinceStart() const;
+  int64_t GetTimeSinceFirstBatch() const;
+  void MaybeTriggerStop(const classic::IterationStats& stats,
+                        classic::StoppersHints* hints);
+  void MaybeOutputInfo(const classic::IterationStats& stats);
+  // Requires nodes_mutex_ to be held.
+  void SendUciInfo(const classic::IterationStats& stats);
+  // Sets stop to true and notifies watchdog thread.
+  void FireStopInternal();
+
+  void SendMovesStats() const;
+  // Function which runs in a separate thread and watches for time and
+  // the uci `stop` command.
+  void WatchdogThread();
+
+  // Fills IterationStats with global (rather than per-thread) portion of search
+  // statistics. Currently all stats there (in IterationStats) are global
+  // though.
+  void PopulateCommonIterationStats(classic::IterationStats* stats);
+
+  // Returns verbose information about given node, as vector of strings.
+  // Node can only be root or ponder (depth 1) and move_to_node is only given
+  // for the ponder node.
+  std::vector<std::string> GetVerboseStats(
+      const Node* node, std::optional<Move> move_to_node) const;
+
+  // Returns the draw score at the root of the search. At odd depth pass true to
+  // the value of @is_odd_depth to change the sign of the draw score.
+  // Depth of a root node is 0 (even number).
+  float GetDrawScore(bool is_odd_depth) const;
+
+  mutable Mutex counters_mutex_ ACQUIRED_AFTER(nodes_mutex_);
+  // Tells all threads to stop.
+  std::atomic<bool> stop_{false};
+  // Condition variable used to watch stop_ variable.
+  std::condition_variable watchdog_cv_;
+  // Tells whether it's ok to respond bestmove when limits are reached.
+  // If false (e.g. during ponder or `go infinite`) the search stops but nothing
+  // is responded until `stop` uci command.
+  bool ok_to_respond_bestmove_ GUARDED_BY(counters_mutex_) = true;
+  // There is already one thread that responded bestmove, other threads
+  // should not do that.
+  bool bestmove_is_sent_ GUARDED_BY(counters_mutex_) = false;
+  // Node garbage collection has been started for this search.
+  bool gc_started_ GUARDED_BY(counters_mutex_) = false;
+  // Stored so that in the case of non-zero temperature GetBestMove() returns
+  // consistent results.
+  Move final_bestmove_ GUARDED_BY(counters_mutex_);
+  Move final_pondermove_ GUARDED_BY(counters_mutex_);
+  std::unique_ptr<classic::SearchStopper> stopper_ GUARDED_BY(counters_mutex_);
+
+  Mutex threads_mutex_;
+  std::vector<std::thread> threads_ GUARDED_BY(threads_mutex_);
+
+  Node* root_node_;
+  TranspositionTable* tt_;
+  SyzygyTablebase* syzygy_tb_;
+  // Fixed positions which happened before the search.
+  const PositionHistory& played_history_;
+
+  Backend* const backend_;
+  BackendAttributes backend_attributes_;
+  const SearchParams params_;
+  const MoveList searchmoves_;
+  const std::chrono::steady_clock::time_point start_time_;
+  int64_t initial_visits_;
+  // root_is_in_dtz_ must be initialized before root_move_filter_.
+  bool root_is_in_dtz_ = false;
+  // tb_hits_ must be initialized before root_move_filter_.
+  std::atomic<int> tb_hits_{0};
+  const MoveList root_move_filter_;
+
+  mutable SharedMutex nodes_mutex_;
+  EdgeAndNode current_best_edge_ GUARDED_BY(nodes_mutex_);
+  Edge* last_outputted_info_edge_ GUARDED_BY(nodes_mutex_) = nullptr;
+  ThinkingInfo last_outputted_uci_info_ GUARDED_BY(nodes_mutex_);
+  int64_t total_playouts_ GUARDED_BY(nodes_mutex_) = 0;
+  int64_t network_evaluations_ GUARDED_BY(nodes_mutex_) = 0;
+  int64_t total_batches_ GUARDED_BY(nodes_mutex_) = 0;
+  // Maximum search depth = length of longest path taken in PickNodeToExtend.
+  uint16_t max_depth_ GUARDED_BY(nodes_mutex_) = 0;
+  // Cumulative depth of all paths taken in PickNodeToExtend.
+  uint64_t cum_depth_ GUARDED_BY(nodes_mutex_) = 0;
+
+  // The start time of search. It is set when the first thread exits
+  // GatherMinibatch. It is guarded by nodes mutex until set once.
+  std::optional<std::chrono::steady_clock::time_point> nps_start_time_;
+
+  std::atomic<int> pending_searchers_{0};
+  std::atomic<int> backend_waiting_counter_{0};
+  std::atomic<int> thread_count_{0};
+
+  std::unique_ptr<UciResponder> uci_responder_;
+  ContemptMode contempt_mode_;
+  friend class SearchWorker;
+};
+
+// Single thread worker of the search engine.
+// That used to be just a function Search::Worker(), but to parallelize it
+// within one thread, have to split into stages.
+class SearchWorker {
+ public:
+  static constexpr int kTaskCountDigits = std::numeric_limits<int>::digits + 1;
+  static constexpr int kTasksTakenShift = kTaskCountDigits/2;
+  static constexpr int kTasksTakenOne = 1 << kTasksTakenShift;
+  // Suspend sets every bit below kTasksTakenShift, i.e. -1 for the low half.
+  static constexpr int kTaskCountSuspend = kTasksTakenOne - 1;
+
+  SearchWorker(Search* search, const SearchParams& params)
+      : search_(search),
+        history_(search_->played_history_),
+        params_(params),
+        moves_left_support_(search_->backend_attributes_.has_mlh) {
+    task_workers_ = params.GetTaskWorkersPerSearchWorker();
+    if (task_workers_ < 0 && search_->backend_attributes_.runs_on_cpu) {
+      task_workers_ = 0;
+    } else if (task_workers_ < 0) {
+      // Signed math on purpose: the unsigned `hw / threads - 1` wrapped around
+      // (and so picked 4) when hw was 0 or below the working thread count.
+      const int hw = static_cast<int>(std::thread::hardware_concurrency());
+      const int working_threads = std::max(
+          search_->thread_count_.load(std::memory_order_acquire) - 1, 1);
+      task_workers_ = std::min(std::max(hw / working_threads - 1, 0), 4);
+    }
+    for (int i = 0; i < task_workers_; i++) {
+      task_workspaces_.emplace_back();
+      task_threads_.emplace_back([this, i]() {
+          LOGFILE << "Task worker " << i << " starting.";
+          this->RunTasks(i);
+          LOGFILE << "Task worker " << i << " exiting.";
+        });
+    }
+    target_minibatch_size_ = params_.GetMiniBatchSize();
+    if (target_minibatch_size_ == 0) {
+      target_minibatch_size_ =
+          search_->backend_attributes_.recommended_batch_size;
+    }
+    max_out_of_order_ =
+        std::max(1, static_cast<int>(params_.GetMaxOutOfOrderEvalsFactor() *
+                                     target_minibatch_size_));
+  }
+
+  ~SearchWorker();
+
+  // Runs iterations while needed. Any std::exception aborts the process.
+  void RunBlocking() {
+    LOGFILE << "Started search thread.";
+    try {
+      // A very early stop may arrive before this point, so the test is at the
+      // end to ensure at least one iteration runs before exiting.
+      do {
+        ExecuteOneIteration();
+      } while (search_->IsSearchActive());
+    } catch (const std::exception& e) {
+      std::cerr << "Unhandled exception in worker thread: " << e.what()
+                << std::endl;
+      abort();
+    }
+  }
+
+  // Does one full iteration of MCTS search:
+  // 1. Initialize internal structures.
+  // 2. Gather minibatch.
+  // 3. (No step 3 is declared below; the numbering skips from 2b to 4.)
+  // 4. Run NN computation.
+  // 5. Retrieve NN computations (and terminal values) into nodes.
+  // 6. Propagate the new nodes' information to all their parents in the tree.
+  // 7. Update the Search's status and progress information.
+  void ExecuteOneIteration();
+
+  // The same operations one by one:
+  // 1. Initialize internal structures.
+  // NOTE(review): an older comment named a @computation argument; none exists.
+  void InitializeIteration();
+
+  // 2. Gather minibatch.
+  void GatherMinibatch();
+
+  // 2b. Copy collisions into shared_collisions_.
+  void CollectCollisions();
+
+  // 4. Run NN computation.
+  void RunNNComputation();
+
+  // 5. Retrieve NN computations (and terminal values) into nodes.
+  void FetchMinibatchResults();
+
+  // 6. Propagate the new nodes' information to all their parents in the tree.
+  void DoBackupUpdate();
+
+  // 7. Update the Search's status and progress information.
+  void UpdateCounters();
+
+ private:
+  struct NodeToProcess {
+    bool IsExtendable() const {
+      return !is_collision && !node->IsTerminal() && !node->GetLowNode();
+    }
+    bool IsCollision() const { return is_collision; }
+    bool CanEvalOutOfOrder() const {
+      return is_tt_hit || is_cache_hit || node->IsTerminal() ||
+             node->GetLowNode();
+    }
+
+    // The path to the node to extend.
+    BackupPath path;
+    // The node to extend (the last node of @path).
+    Node* node;
+    std::unique_ptr<EvalResult> eval;
+    int multivisit = 0;
+    // If greater than multivisit, and other parameters don't imply a lower
+    // limit, multivisit could be increased to this value without additional
+    // change in outcome of next selection.
+    int maxvisit = 0;
+    bool nn_queried = false;
+    bool is_tt_hit = false;
+    bool is_cache_hit = false;
+    bool is_collision = false;
+
+    // Details that are filled in as we go.
+    uint64_t hash;
+    std::shared_ptr<LowNode> tt_low_node;
+    PositionHistory history;
+    bool ooo_completed = false;
+
+    // Repetition draws.
+    int repetitions = 0;
+
+    static NodeToProcess Collision(const BackupPath& path, int collision_count,
+                                   int max_count) {
+      return NodeToProcess(path, collision_count, max_count);
+    }
+    static NodeToProcess Visit(const BackupPath& path,
+                               const PositionHistory& history) {
+      return NodeToProcess(path, history);
+    }
+
+    std::string DebugString() const {
+      std::ostringstream oss;
+      oss << "<NodeToProcess> This:" << this << " Depth:" << path.size()
+          << " Node:" << node << " Multivisit:" << multivisit
+          << " Maxvisit:" << maxvisit << " NNQueried:" << nn_queried
+          << " TTHit:" << is_tt_hit << " CacheHit:" << is_cache_hit
+          << " Collision:" << is_collision << " OOO:" << ooo_completed
+          << " Repetitions:" << repetitions << " Path:";
+      for (auto it = path.cbegin(); it != path.cend(); ++it) {
+        if (it != path.cbegin()) oss << "->";
+        auto n = std::get<0>(*it);
+        const auto& nl = n->GetLowNode();
+        oss << n << ":" << n->GetNInFlight();
+        if (nl) {
+          oss << "(" << nl << ")";
+        }
+      }
+      oss << " --- " << std::get<0>(path.back())->DebugString();
+      if (node->GetLowNode())
+        oss << " --- " << node->GetLowNode()->DebugString();
+
+      return oss.str();
+    }
+
+   private:
+    NodeToProcess(const BackupPath& path, uint32_t multivisit,
+                  uint32_t max_count)
+        : path(path),
+          node(std::get<0>(path.back())),
+          eval(std::make_unique<EvalResult>()),
+          multivisit(multivisit),
+          maxvisit(max_count),
+          is_collision(true),
+          repetitions(0) {}
+    NodeToProcess(const BackupPath& path, const PositionHistory& in_history)
+        : path(path),
+          node(std::get<0>(path.back())),
+          eval(std::make_unique<EvalResult>()),
+          multivisit(1),
+          maxvisit(0),
+          is_collision(false),
+          history(in_history),
+          repetitions(std::get<1>(path.back())) {}
+  };
+
+  // Holds per task worker scratch data; the constructor pre-reserves capacity.
+  struct TaskWorkspace {
+    std::array<Node::Iterator, 256> cur_iters;
+    std::vector<std::unique_ptr<std::array<int, 256>>> vtp_buffer;
+    std::vector<std::unique_ptr<std::array<int, 256>>> visits_to_perform;
+    std::vector<int> vtp_last_filled;
+    std::vector<int> current_path;
+    BackupPath full_path;
+    TaskWorkspace() {  // Reserve 30 entries up front in each growable member.
+      vtp_buffer.reserve(30);
+      visits_to_perform.reserve(30);
+      vtp_last_filled.reserve(30);
+      current_path.reserve(30);
+      full_path.reserve(30);
+    }
+  };
+
+  struct PickTask {  // Element type of picking_tasks_; one of two task kinds.
+    enum PickTaskType { kGathering, kProcessing };
+    PickTaskType task_type;
+
+    // For task type gathering (left at defaults by the processing constructor).
+    BackupPath start_path;
+    Node* start = nullptr;
+    int collision_limit = 0;
+    PositionHistory history;
+    std::vector<NodeToProcess> results;
+
+    // Task type post gather processing (defaults for gathering tasks).
+    int start_idx = 0;
+    int end_idx = 0;
+
+    bool complete = false;
+
+    PickTask(const BackupPath& start_path, const PositionHistory& in_history,
+             int collision_limit)  // Gathering task; start = last path node.
+        : task_type(kGathering),
+          start_path(start_path),
+          start(std::get<0>(start_path.back())),
+          collision_limit(collision_limit),
+          history(in_history) {}
+    PickTask(int start_idx, int end_idx)  // Processing task over an index range.
+        : task_type(kProcessing), start_idx(start_idx), end_idx(end_idx) {}
+  };
+
+  NodeToProcess PickNodeToExtend(int collision_limit);
+  // Adjust parameters for updating node @n and its parent low node if node is
+  // terminal or its child low node is a transposition. Also update bounds and
+  // terminal status of node @n using information from its child low node.
+  // Return true if adjustment happened.
+  bool MaybeAdjustForTerminalOrTransposition(Node* n,
+                                             const std::shared_ptr<LowNode>& nl,
+                                             float& v, float& d, float& m,
+                                             uint32_t& n_to_fix, float& v_delta,
+                                             float& d_delta, float& m_delta,
+                                             bool& update_parent_bounds) const;
+  void DoBackupUpdateSingleNode(const NodeToProcess& node_to_process);
+  // Returns whether a node's bounds were set based on its children.
+  bool MaybeSetBounds(Node* p, float m, uint32_t* n_to_fix, float* v_delta,
+                      float* d_delta, float* m_delta) const;
+  void PickNodesToExtend(int collision_limit);
+  void PickNodesToExtendTask(const BackupPath& path, int collision_limit,
+                             PositionHistory& history,
+                             std::vector<NodeToProcess>* receiver,
+                             TaskWorkspace* workspace);
+  void CancelCollisions();
+
+  // Check if the situation described by @depth under root and @position is a
+  // safe two-fold or a draw by repetition and return the number of safe
+  // repetitions and moves_left.
+  std::pair<int, int> GetRepetitions(int depth, const Position& position);
+  // Check if there is a reason to stop picking and pick @node.
+  bool ShouldStopPickingHere(Node* node, bool is_root_node, int repetitions);
+  void ProcessPickedTask(int batch_start, int batch_end);
+  void ExtendNode(NodeToProcess& picked_node);
+  void FetchSingleNodeResult(NodeToProcess* node_to_process);
+  std::tuple<PickTask*, int, int> PickTaskToProcess();
+  void ProcessTask(PickTask* task, int id,
+                   std::vector<NodeToProcess>* receiver,
+                   TaskWorkspace* workspace);
+  void RunTasks(int tid);
+  void ResetTasks();
+  // Returns how many tasks there were.
+  int WaitForTasks();
+
+  Search* const search_;
+  // List of nodes to process.
+  std::vector<NodeToProcess> minibatch_;
+  std::unique_ptr<BackendComputation> computation_;
+  int task_workers_;
+  int target_minibatch_size_;
+  int max_out_of_order_;
+  // History is reset and extended by PickNodeToExtend().
+  PositionHistory history_;
+  int number_out_of_order_ = 0;
+  const SearchParams& params_;
+  std::unique_ptr<Node> precached_node_;
+  const bool moves_left_support_;
+  classic::IterationStats iteration_stats_;
+  classic::StoppersHints latest_time_manager_hints_;
+
+  // Multigather task related fields.
+
+  Mutex picking_tasks_mutex_;
+  std::vector<PickTask> picking_tasks_;
+  // A packed atomic. LSB half is task_count_. MSB half is tasks_taken_.
+  std::atomic<int> task_count_ = kTaskCountSuspend;
+  std::atomic<int> completed_tasks_ = 0;
+  std::condition_variable task_added_;
+  std::vector<std::thread> task_threads_;
+  std::vector<TaskWorkspace> task_workspaces_;
+  TaskWorkspace main_workspace_;
+  bool exiting_ = false;
+};
+
+}  // namespace dag_classic
+}  // namespace lczero
diff --git a/src/search/dag_classic/wrapper.cc b/src/search/dag_classic/wrapper.cc
new file mode 100644
index 0000000000..8de3b28385
--- /dev/null
+++ b/src/search/dag_classic/wrapper.cc
@@ -0,0 +1,177 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "chess/gamestate.h"
+#include "search/classic/stoppers/factory.h"
+#include "search/dag_classic/search.h"
+#include "search/register.h"
+#include "search/search.h"
+#include "neural/shared_params.h"
+#include "utils/trace.h"
+
+namespace lczero {
+namespace dag_classic {
+namespace {
+
+const OptionId kThreadsOptionId{  // Read in StartSearch() for StartThreads().
+    {.long_flag = "threads",
+     .uci_option = "Threads",
+     .help_text =
+         "Number of (CPU) worker threads to use, 0 for the backend default.",
+     .short_flag = 't',
+     .visibility = OptionId::kAlwaysVisible}};
+const OptionId kClearTree{  // Button; StartSearch() trims the tree if it is set.
+    {.long_flag = "",
+     .uci_option = "ClearTree",
+     .help_text = "Clear the tree before the next search.",
+     .visibility = OptionId::kProOnly}};
+
+class DagClassicSearch : public SearchBase {  // SearchBase front end for Search.
+ public:
+  DagClassicSearch(UciResponder* responder, const OptionsDict* options)
+      : SearchBase(responder), options_(options) {}
+  ~DagClassicSearch() { search_.reset(); }  // Before the later-declared tree_/tt_.
+
+ private:
+  void NewGame() override;
+  void SetPosition(const GameState& pos) override;
+  void StartSearch(const GoParams&) override;
+  void StartClock() override {  // Timestamp later handed to Search.
+    move_start_time_ = std::chrono::steady_clock::now();
+  }
+  void WaitSearch() override {
+    LOGFILE << "Waiting for the search.";
+    if (search_) search_->Wait();  // Nothing to wait for without a search.
+  }
+  void StopSearch() override {
+    if (search_) search_->Stop();
+  }
+  void AbortSearch() override {
+    if (search_) search_->Abort();
+  }
+
+  const OptionsDict* options_;  // Stored as a raw pointer; never deleted here.
+  std::unique_ptr<classic::TimeManager> time_manager_;
+  std::unique_ptr<Search> search_;  // Recreated by every StartSearch().
+  std::unique_ptr<NodeTree> tree_;  // Created lazily in SetPosition().
+  TranspositionTable tt_;  // Handed to each Search; cleared in NewGame().
+  std::optional<std::chrono::steady_clock::time_point> move_start_time_;
+};
+
+// Parses move strings on `board`, keeping only the ones that are legal there.
+// Empty input gives an empty list; throws when none of the moves is legal.
+MoveList StringsToMovelist(const std::vector<std::string>& moves,
+                           const ChessBoard& board) {
+  MoveList filtered;
+  if (moves.empty()) return filtered;
+  filtered.reserve(moves.size());
+  const auto legal = board.GenerateLegalMoves();
+  for (const auto& text : moves) {
+    const Move parsed = board.ParseMove(text);
+    const bool is_legal =
+        std::find(legal.begin(), legal.end(), parsed) != legal.end();
+    if (is_legal) filtered.emplace_back(parsed);
+  }
+  if (filtered.empty()) throw Exception("No legal searchmoves.");
+  return filtered;
+}
+
+void DagClassicSearch::NewGame() {
+  LCTRACE_FUNCTION_SCOPE;
+  LOGFILE << "New game.";
+  search_.reset();  // Drop the search before the tt_/tree_ it was given.
+  tt_.clear();
+  tree_.reset();  // SetPosition() recreates the tree when it is missing.
+  time_manager_ = classic::MakeTimeManager(*options_);
+}
+
+void DagClassicSearch::SetPosition(const GameState& pos) {
+  LCTRACE_FUNCTION_SCOPE;
+  if (!tree_) tree_ = std::make_unique<NodeTree>();  // First call or post-NewGame.
+  const bool is_same_game = tree_->ResetToPosition(pos);
+  LOGFILE << "Tree reset to a new position.";
+  if (!is_same_game) time_manager_ = classic::MakeTimeManager(*options_);
+}
+
+void DagClassicSearch::StartSearch(const GoParams& params) {
+  LCTRACE_FUNCTION_SCOPE;
+  auto forwarder =
+      std::make_unique<NonOwningUciRespondForwarder>(uci_responder_);
+  if (options_->Get<Button>(kClearTree).TestAndReset()) {
+    tree_->TrimTreeAtHead();  // NOTE(review): assumes SetPosition() already ran.
+    LOGFILE << "Tree cleared.";
+  }
+
+  const auto cache_size =
+      options_->Get<int>(SharedBackendParams::kNNCacheSizeId);
+  // FIXME: This is too conservative. Rough per-node/per-cache-item sizes.
+  const size_t kAvgNodeSize =
+      sizeof(Node) + sizeof(LowNode) +
+      classic::MemoryWatchingStopper::kAvgMovesPerPosition * sizeof(Edge);
+  const size_t kAvgCacheItemSize =
+      3 * sizeof(float) + sizeof(std::unique_ptr<float[]>) +
+      sizeof(float[classic::MemoryWatchingStopper::kAvgMovesPerPosition]);
+  size_t total_memory =  // Tree + transposition table + NN cache estimate.
+      tree_.get()->GetCurrentHead()->GetN() * kAvgNodeSize +
+      (sizeof(TranspositionTable::value_type) + 1) * tt_.bucket_count() +
+      cache_size * kAvgCacheItemSize;
+  auto stopper = time_manager_->GetStopper(
+      params, tree_.get()->HeadPosition(), total_memory, kAvgNodeSize,
+      tree_.get()->GetCurrentHead()->GetN());
+  search_ = std::make_unique<Search>(  // Replaces any previous Search.
+      *tree_, backend_, std::move(forwarder),
+      StringsToMovelist(params.searchmoves, tree_->HeadPosition().GetBoard()),
+      *move_start_time_, std::move(stopper), params.infinite, params.ponder,
+      *options_, &tt_, syzygy_tb_);  // NOTE(review): assumes StartClock() ran.
+
+  LOGFILE << "Timer started at "
+          << FormatTime(SteadyClockToSystemClock(*move_start_time_));
+  search_->StartThreads(options_->Get<int>(kThreadsOptionId));
+}
+
+class DagClassicSearchFactory : public SearchFactory {  // Registered just below.
+  std::string_view GetName() const override { return "dag-preview"; }
+  std::unique_ptr<SearchBase> CreateSearch(
+      UciResponder* responder, const OptionsDict* options) const override {
+    LCTRACE_FUNCTION_SCOPE;
+    return std::make_unique<DagClassicSearch>(responder, options);
+  }
+
+  void PopulateParams(OptionsParser* parser) const override {
+    parser->Add<IntOption>(kThreadsOptionId, 0, 128) = 0;  // 0..128, default 0.
+    SearchParams::Populate(parser);
+    classic::PopulateTimeManagementOptions(classic::RunType::kUci, parser);
+
+    parser->Add<ButtonOption>(kClearTree);
+  }
+};
+
+REGISTER_SEARCH(DagClassicSearchFactory)
+
+}  // namespace
+}  // namespace dag_classic
+}  // namespace lczero
diff --git a/src/search/instamove/instamove.cc b/src/search/instamove/instamove.cc
new file mode 100644
index 0000000000..554ac1ba96
--- /dev/null
+++ b/src/search/instamove/instamove.cc
@@ -0,0 +1,219 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "chess/gamestate.h"
+#include "chess/uciloop.h"
+#include "neural/backend.h"
+#include "neural/batchsplit.h"
+#include "search/register.h"
+#include "search/search.h"
+
+namespace lczero {
+namespace {
+
+class InstamoveSearch : public SearchBase {  // Move is chosen in StartSearch().
+ public:
+  using SearchBase::SearchBase;
+
+ private:
+  virtual Move GetBestMove(const GameState& game_state) = 0;  // Per-search hook.
+
+  void SetPosition(const GameState& game_state) final {
+    game_state_ = game_state;  // Copied; used by the next StartSearch().
+  }
+
+  void StartSearch(const GoParams& go_params) final {
+    responded_bestmove_.store(false, std::memory_order_relaxed);
+    bestmove_ = GetBestMove(game_state_);  // Computed synchronously, right here.
+    if (!go_params.infinite && !go_params.ponder) RespondBestMove();
+  }
+  void WaitSearch() final {
+    while (!responded_bestmove_.load()) {  // Poll until replied or aborted.
+      std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    }
+  }
+  void StopSearch() final { RespondBestMove(); }
+  void AbortSearch() final { responded_bestmove_.store(true); }  // No reply sent.
+  void RespondBestMove() {
+    if (responded_bestmove_.exchange(true)) return;  // Reply at most once.
+    BestMoveInfo info{bestmove_};
+    // TODO Remove this when move will be encoded from white perspective.
+    if (game_state_.CurrentPosition().IsBlackToMove()) {
+      info.bestmove.Flip();
+    } else if (!info.ponder.is_null()) {
+      info.ponder.Flip();
+    }
+    uci_responder_->OutputBestMove(&info);
+  }
+
+  void SetBackend(Backend* backend) override {
+    batchsplit_backend_ = CreateBatchSplitingBackend(backend);  // Owned wrapper.
+    backend_ = batchsplit_backend_.get();
+  }
+  void StartClock() final {}
+
+  Move bestmove_;
+  std::atomic<bool> responded_bestmove_{false};
+  std::unique_ptr<Backend> batchsplit_backend_;
+  GameState game_state_;
+};
+
+// Plays the legal move whose policy-head output is the largest.
+class PolicyHeadSearch : public InstamoveSearch {
+ public:
+  using InstamoveSearch::InstamoveSearch;
+
+  Move GetBestMove(const GameState& game_state) final {
+    const std::vector<Position> positions = game_state.GetPositions();
+    MoveList legal_moves = positions.back().GetBoard().GenerateLegalMoves();
+    // No legal moves: indexing legal_moves below would be out of range.
+    if (legal_moves.empty()) return Move();
+    std::vector<EvalResult> res = backend_->EvaluateBatch(
+        std::vector<EvalPosition>{EvalPosition{positions, legal_moves}});
+    const EvalResult& eval = res[0];
+    const size_t best_move_idx =
+        std::max_element(eval.p.begin(), eval.p.end()) - eval.p.begin();
+
+    auto to_int = [](double x) { return static_cast<int>(std::round(x)); };
+    std::vector<ThinkingInfo> infos = {{
+        .depth = 1,
+        .seldepth = 1,
+        .nodes = 1,
+        .score = 90 * std::tan(1.5637541897 * eval.q),
+        .wdl = ThinkingInfo::WDL{to_int(500 * (1 + eval.q - eval.d)),
+                                 to_int(1000 * eval.d),
+                                 to_int(500 * (1 - eval.q - eval.d))},
+    }};
+    uci_responder_->OutputThinkingInfo(&infos);
+
+    return legal_moves[best_move_idx];
+  }
+};
+
+// Plays the move whose resulting position the value head rates best for us.
+class ValueHeadSearch : public InstamoveSearch {
+ public:
+  using InstamoveSearch::InstamoveSearch;
+  Move GetBestMove(const GameState& game_state) final {
+    std::unique_ptr<BackendComputation> computation =
+        backend_->CreateComputation();
+
+    PositionHistory history(game_state.GetPositions());
+    const ChessBoard& board = history.Last().GetBoard();
+    const std::vector<Move> legal_moves = board.GenerateLegalMoves();
+    // No legal moves: results[0] and legal_moves[0] below would be out of range.
+    if (legal_moves.empty()) return Move();
+
+    struct Score {
+      float negative_q;  // Negative because NN evaluates from opponent's
+                         // perspective.
+      float d;
+      std::optional<int> mate;
+
+      bool operator<(const Score& other) const {
+        // Mate always beats non-mate
+        if (mate && !other.mate) return true;
+        if (!mate && other.mate) return false;
+        // Both mates: shorter is better
+        if (mate && other.mate) return *mate < *other.mate;
+        // Neither mate: lower negative_q is better
+        return negative_q < other.negative_q;
+      }
+    };
+
+    std::vector<Score> results(legal_moves.size());
+
+    for (size_t i = 0; i < legal_moves.size(); i++) {
+      Move move = legal_moves[i];
+      history.Append(move);
+      switch (history.ComputeGameResult()) {
+        case GameResult::UNDECIDED:
+          computation->AddInput(
+              EvalPosition{history.GetPositions(), {}},
+              EvalResultPtr{.q = &results[i].negative_q, .d = &results[i].d});
+          break;
+        case GameResult::DRAW:
+          results[i] = {.negative_q = 0, .d = 1, .mate = std::nullopt};
+          break;
+        default:
+          // A legal move to a non-drawn terminal without tablebases must be a
+          // win.
+          results[i] = {.negative_q = -1, .d = 0, .mate = 1};
+      }
+      history.Pop();
+    }
+
+    computation->ComputeBlocking();
+    const size_t best_idx =
+        std::min_element(results.begin(), results.end()) - results.begin();
+    const Score& r = results[best_idx];
+    auto to_int = [](double x) { return static_cast<int>(std::round(x)); };
+    std::vector<ThinkingInfo> infos{
+        {.depth = 1,
+         .seldepth = 1,
+         .nodes = static_cast<int64_t>(legal_moves.size()),
+         .mate = r.mate,
+         .score = r.mate ? std::nullopt
+                         : std::make_optional<int>(
+                               -90 * std::tan(1.5637541897 * r.negative_q)),
+         .wdl = r.mate
+                    ? std::nullopt
+                    : std::make_optional<ThinkingInfo::WDL>(ThinkingInfo::WDL{
+                          .w = to_int(500 * (1 - r.negative_q - r.d)),
+                          .d = to_int(1000 * r.d),
+                          .l = to_int(500 * (1 + r.negative_q - r.d)),
+                      })}};
+    uci_responder_->OutputThinkingInfo(&infos);
+    return legal_moves[best_idx];
+  }
+};
+
+class PolicyHeadFactory : public SearchFactory {  // Builds PolicyHeadSearch.
+  std::string_view GetName() const override { return "policyhead"; }
+  std::unique_ptr<SearchBase> CreateSearch(UciResponder* responder,
+                                           const OptionsDict*) const override {
+    return std::make_unique<PolicyHeadSearch>(responder);  // Options unused.
+  }
+};
+
+class ValueHeadFactory : public SearchFactory {  // Builds ValueHeadSearch.
+  std::string_view GetName() const override { return "valuehead"; }
+  std::unique_ptr<SearchBase> CreateSearch(UciResponder* responder,
+                                           const OptionsDict*) const override {
+    return std::make_unique<ValueHeadSearch>(responder);  // Options unused.
+  }
+};
+
+REGISTER_SEARCH(PolicyHeadFactory)
+REGISTER_SEARCH(ValueHeadFactory)
+
+}  // namespace
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/search/mock_search.h b/src/search/mock_search.h
new file mode 100644
index 0000000000..3a4b61ab2d
--- /dev/null
+++ b/src/search/mock_search.h
@@ -0,0 +1,60 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <gmock/gmock.h>
+
+#include "search/search.h"
+
+namespace lczero {
+
+class MockSearch : public SearchBase {  // gMock implementation of SearchBase.
+ public:
+  using SearchBase::SearchBase;
+  UciResponder* GetUciResponder() const { return uci_responder_; }  // For tests.
+  MOCK_METHOD(void, SetBackend, (Backend * backend), (override));
+  MOCK_METHOD(void, SetSyzygyTablebase, (SyzygyTablebase * tb), (override));
+  MOCK_METHOD(void, NewGame, (), (override));
+  MOCK_METHOD(void, SetPosition, (const GameState&), (override));
+  MOCK_METHOD(void, StartSearch, (const GoParams&), (override));
+  MOCK_METHOD(void, StartClock, (), (override));
+  MOCK_METHOD(void, WaitSearch, (), (override));
+  MOCK_METHOD(void, StopSearch, (), (override));
+  MOCK_METHOD(void, AbortSearch, (), (override));
+  MOCK_METHOD(SearchArtifacts, GetArtifacts, (), (const, override));
+};
+
+class MockSearchFactory : public SearchFactory {  // gMock SearchFactory.
+ public:
+  MOCK_METHOD(std::string_view, GetName, (), (const, override));
+  MOCK_METHOD(void, PopulateParams, (OptionsParser*), (const, override));
+  MOCK_METHOD(std::unique_ptr<SearchBase>, CreateSearch,
+              (UciResponder*, const OptionsDict*), (const, override));
+};
+
+}  // namespace lczero
diff --git a/src/search/register.cc b/src/search/register.cc
new file mode 100644
index 0000000000..203d4fa8a9
--- /dev/null
+++ b/src/search/register.cc
@@ -0,0 +1,59 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2020 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "search/register.h"
+
+#include <algorithm>
+
+namespace lczero {
+
+SearchManager* SearchManager::Get() {
+  static SearchManager factory;  // Function-local static: built on first call.
+  return &factory;
+}
+
+void SearchManager::AddSearchFactory(std::unique_ptr<SearchFactory> algorithm) {
+  algorithms_.push_back(std::move(algorithm));  // Takes ownership; no dedup.
+}
+
+// Returns the name of every registered factory, in registration order.
+std::vector<std::string_view> SearchManager::GetSearchNames() const {
+  std::vector<std::string_view> names;
+  names.reserve(algorithms_.size());
+  for (const auto& factory : algorithms_) {
+    names.push_back(factory->GetName());
+  }
+  return names;
+}
+
+SearchFactory* SearchManager::GetFactoryByName(std::string_view name) const {
+  for (const auto& factory : algorithms_) {  // First matching name wins.
+    if (factory->GetName() == name) return factory.get();
+  }
+  return nullptr;  // No registered factory has this name.
+}
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/search/register.h b/src/search/register.h
new file mode 100644
index 0000000000..1625f18a1b
--- /dev/null
+++ b/src/search/register.h
@@ -0,0 +1,69 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "search/search.h"
+
+namespace lczero {
+
+class SearchFactory;
+
+// A singleton class that keeps one instance of each search algorithm's factory.
+class SearchManager {
+ public:
+  static SearchManager* Get();
+  void AddSearchFactory(std::unique_ptr<SearchFactory>);  // Takes ownership.
+
+  std::vector<std::string_view> GetSearchNames() const;  // Registration order.
+
+  // Returns the factory for the given algorithm name. Returns nullptr if not
+  // found.
+  SearchFactory* GetFactoryByName(std::string_view name) const;
+
+  struct Register {  // Constructing one adds `factory`; see REGISTER_SEARCH.
+    Register(std::unique_ptr<SearchFactory> factory) {
+      SearchManager::Get()->AddSearchFactory(std::move(factory));
+    }
+  };
+
+ private:
+  SearchManager() = default;  // Private: instances come from Get() only.
+
+  std::vector<std::unique_ptr<SearchFactory>> algorithms_;
+};
+
+#define REGISTER_SEARCH(alg)                                      \
+  namespace {                                                     \
+  [[maybe_unused]] static SearchManager::Register reg3b50Y_##alg( \
+      std::make_unique<alg>());                                   \
+  }
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/search/search.h b/src/search/search.h
new file mode 100644
index 0000000000..40693a399b
--- /dev/null
+++ b/src/search/search.h
@@ -0,0 +1,104 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <memory>
+#include <span>
+
+#include "search/artifacts.h"
+#include "utils/exception.h"
+
+namespace lczero {
+
+class Backend;
+struct GameState;
+struct GoParams;
+class OptionsDict;
+class OptionsParser;
+class UciResponder;
+class SyzygyTablebase;
+
+class SearchBase {
+ public:
+  SearchBase(UciResponder* responder) : uci_responder_(responder) {}
+  virtual ~SearchBase() = default;
+
+  // Sets objects needed by the search.
+  // They are guaranteed to be set before any other function is called, and
+  // after that can only be changed while the search is stopped.
+  virtual void SetBackend(Backend* backend) { backend_ = backend; }
+  virtual void SetSyzygyTablebase(SyzygyTablebase* tb) { syzygy_tb_ = tb; }
+
+  // Resets search tree, and whatever else is needed to start a new game.
+  virtual void NewGame() {}
+  // Sets the position to search from in the future searches.
+  virtual void SetPosition(const GameState&) = 0;
+  // Start the search. Must not block, should return immediately.
+  virtual void StartSearch(const GoParams&) = 0;
+  // Starts the timer for the search. Must not block, should return immediately.
+  // It can be called either after or before StartSearch(), particularly:
+  // - In the "strict timing" mode, it's called before SetPosition().
+  // - In normal mode, it's called before StartSearch().
+  // - In Ponder mode, it may potentially be called at `ponderhit` (although
+  // actually we'll stop the search, change the position and start again).
+  virtual void StartClock() = 0;
+  // Wait for the search to finish. This is blocking.
+  virtual void WaitSearch() = 0;
+  // Stops the search as soon as possible and responds with bestmove. Doesn't
+  // block.
+  virtual void StopSearch() = 0;
+  // Same as StopSearch(), but doesn't respond with bestmove. Doesn't block.
+  virtual void AbortSearch() = 0;
+  // Return the data needed to build a training data frame from the last search.
+  virtual SearchArtifacts GetArtifacts() const {
+    throw Exception(
+        "Training data generation is not supported for this search algorithm.");
+  }
+
+ protected:
+  UciResponder* uci_responder_ = nullptr;
+  Backend* backend_ = nullptr;
+  SyzygyTablebase* syzygy_tb_ = nullptr;
+};
+
+// Creates an environment for a given search algorithm. One instance of the
+// factory per algorithm is created at the start of the program, and registered
+// in the SearchManager.
+class SearchFactory {
+ public:
+  virtual ~SearchFactory() = default;
+  // Name of the algorithm (used in UCI options or command line).
+  virtual std::string_view GetName() const = 0;
+  // Populates the parameters of the algorithm; the default adds none.
+  virtual void PopulateParams(OptionsParser*) const {}
+  // Creates a new environment (a SearchBase instance) for the algorithm.
+  virtual std::unique_ptr<SearchBase> CreateSearch(
+      UciResponder*, const OptionsDict*) const = 0;
+};
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/selfplay/game.cc b/src/selfplay/game.cc
index 68554c2fc3..08ba1688ac 100644
--- a/src/selfplay/game.cc
+++ b/src/selfplay/game.cc
@@ -29,8 +29,8 @@
 
 #include <algorithm>
 
-#include "mcts/stoppers/common.h"
-#include "mcts/stoppers/factory.h"
+#include "search/classic/stoppers/common.h"
+#include "search/classic/stoppers/factory.h"
 #include "utils/random.h"
 
 namespace lczero {
@@ -74,7 +74,7 @@ void SelfPlayGame::PopulateUciParams(OptionsParser* options) {
   options->Add<IntOption>(kResignEarliestMoveId, 0, 1000) = 0;
   options->Add<IntOption>(kMinimumAllowedVistsId, 0, 1000000) = 0;
   options->Add<BoolOption>(kUciChess960) = false;
-  PopulateTimeManagementOptions(RunType::kSelfplay, options);
+  PopulateTimeManagementOptions(classic::RunType::kSelfplay, options);
   options->Add<StringOption>(kSyzygyTablebaseId);
   options->Add<FloatOption>(kOpeningStopProbId, 0.0f, 1.0f) = 0.0f;
 }
@@ -84,17 +84,17 @@ SelfPlayGame::SelfPlayGame(PlayerOptions white, PlayerOptions black,
     : options_{white, black},
       chess960_{white.uci_options->Get<bool>(kUciChess960) ||
                 black.uci_options->Get<bool>(kUciChess960)},
-      training_data_(SearchParams(*white.uci_options).GetHistoryFill(),
-                     SearchParams(*black.uci_options).GetHistoryFill(),
-                     white.network->GetCapabilities().input_format) {
+      training_data_(classic::SearchParams(*white.uci_options).GetHistoryFill(),
+                     classic::SearchParams(*black.uci_options).GetHistoryFill(),
+                     pblczero::NetworkFormat::INPUT_CLASSICAL_112_PLANE) {
   orig_fen_ = opening.start_fen;
-  tree_[0] = std::make_shared<NodeTree>();
+  tree_[0] = std::make_shared<classic::NodeTree>();
   tree_[0]->ResetToPosition(orig_fen_, {});
 
   if (shared_tree) {
     tree_[1] = tree_[0];
   } else {
-    tree_[1] = std::make_shared<NodeTree>();
+    tree_[1] = std::make_shared<classic::NodeTree>();
     tree_[1]->ResetToPosition(orig_fen_, {});
   }
   int ply = 0;
@@ -120,6 +120,7 @@ SelfPlayGame::SelfPlayGame(PlayerOptions white, PlayerOptions black,
                                       exit_prob_next * (positions / 2)))) {
       break;
     }
+    if (tree_[0]->IsBlackToMove()) m.Flip();
     tree_[0]->MakeMove(m);
     if (tree_[0] != tree_[1]) tree_[1]->MakeMove(m);
     ply++;
@@ -131,11 +132,6 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training,
                         SyzygyTablebase* syzygy_tb, bool enable_resign) {
   bool blacks_move = tree_[0]->IsBlackToMove();
 
-  // If we are training, verify that input formats are consistent.
-  if (training && options_[0].network->GetCapabilities().input_format !=
-                      options_[1].network->GetCapabilities().input_format) {
-    throw Exception("Can't mix networks with different input format!");
-  }
   // Take syzygy tablebases from player1 options.
   std::string tb_paths =
       options_[0].uci_options->Get<std::string>(kSyzygyTablebaseId);
@@ -166,23 +162,18 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training,
       std::lock_guard<std::mutex> lock(mutex_);
       if (abort_) break;
       auto stoppers = options_[idx].search_limits.MakeSearchStopper();
-      PopulateIntrinsicStoppers(stoppers.get(), *options_[idx].uci_options);
+      classic::PopulateIntrinsicStoppers(stoppers.get(),
+                                         *options_[idx].uci_options);
 
       std::unique_ptr<UciResponder> responder =
           std::make_unique<CallbackUciResponder>(
               options_[idx].best_move_callback, options_[idx].info_callback);
 
-      if (!chess960_) {
-        // Remap FRC castling to legacy castling.
-        responder = std::make_unique<Chess960Transformer>(
-            std::move(responder), tree_[idx]->HeadPosition().GetBoard());
-      }
-
-      search_ = std::make_unique<Search>(
-          *tree_[idx], options_[idx].network, std::move(responder),
+      search_ = std::make_unique<classic::Search>(
+          *tree_[idx], options_[idx].backend, std::move(responder),
           /* searchmoves */ MoveList(), std::chrono::steady_clock::now(),
           std::move(stoppers), /* infinite */ false, /* ponder */ false,
-          *options_[idx].uci_options, options_[idx].cache, syzygy_tb);
+          *options_[idx].uci_options, syzygy_tb);
     }
 
     // Do search.
@@ -238,7 +229,7 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training,
     }
 
     auto node = tree_[idx]->GetCurrentHead();
-    Eval played_eval = best_eval;
+    classic::Eval played_eval = best_eval;
     Move move;
     while (true) {
       move = search_->GetBestMove().first;
@@ -265,9 +256,7 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training,
       }
       PositionHistory history_copy = tree_[idx]->GetPositionHistory();
       Move move_for_history = move;
-      if (tree_[idx]->IsBlackToMove()) {
-        move_for_history.Mirror();
-      }
+      if (tree_[idx]->IsBlackToMove()) move_for_history.Flip();
       history_copy.Append(move_for_history);
       // Ensure not to discard games that are already decided.
       if (history_copy.ComputeGameResult() == GameResult::UNDECIDED) {
@@ -292,16 +281,25 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training,
         }
       }
       // Append training data. The GameResult is later overwritten.
-      NNCacheLock nneval =
-          search_->GetCachedNNEval(tree_[idx]->GetCurrentHead());
+      std::vector<Move> legal_moves = tree_[idx]
+                                          ->GetPositionHistory()
+                                          .Last()
+                                          .GetBoard()
+                                          .GenerateLegalMoves();
+      std::optional<EvalResult> nneval =
+          options_[idx].backend->GetCachedEvaluation(EvalPosition{
+              tree_[idx]->GetPositionHistory().GetPositions(), legal_moves});
       training_data_.Add(tree_[idx]->GetCurrentHead(),
                          tree_[idx]->GetPositionHistory(), best_eval,
-                         played_eval, best_is_proof, best_move, move, nneval);
+                         played_eval, best_is_proof, best_move, move,
+                         legal_moves, nneval,
+                         search_->GetParams().GetPolicySoftmaxTemp());
     }
     // Must reset the search before mutating the tree.
     search_.reset();
 
     // Add best move to the tree.
+    if (tree_[0]->IsBlackToMove()) move.Flip();
     tree_[0]->MakeMove(move);
     if (tree_[0] != tree_[1]) tree_[1]->MakeMove(move);
     blacks_move = !blacks_move;
@@ -310,7 +308,7 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training,
 
 std::vector<Move> SelfPlayGame::GetMoves() const {
   std::vector<Move> moves;
-  for (Node* node = tree_[0]->GetCurrentHead();
+  for (classic::Node* node = tree_[0]->GetCurrentHead();
        node != tree_[0]->GetGameBeginNode(); node = node->GetParent()) {
     moves.push_back(node->GetParent()->GetEdgeToNode(node)->GetMove());
   }
@@ -319,10 +317,9 @@ std::vector<Move> SelfPlayGame::GetMoves() const {
   while (!moves.empty()) {
     Move move = moves.back();
     moves.pop_back();
-    if (!chess960_) move = pos.GetBoard().GetLegacyMove(move);
     pos = Position(pos, move);
     // Position already flipped, therefore flip the move if white to move.
-    if (!pos.IsBlackToMove()) move.Mirror();
+    if (!pos.IsBlackToMove()) move.Flip();
     result.push_back(move);
   }
   return result;
@@ -355,18 +352,19 @@ void SelfPlayGame::WriteTrainingData(TrainingDataWriter* writer) const {
   training_data_.Write(writer, game_result_, adjudicated_);
 }
 
-std::unique_ptr<ChainedSearchStopper> SelfPlayLimits::MakeSearchStopper()
-    const {
-  auto result = std::make_unique<ChainedSearchStopper>();
+std::unique_ptr<classic::ChainedSearchStopper>
+SelfPlayLimits::MakeSearchStopper() const {
+  auto result = std::make_unique<classic::ChainedSearchStopper>();
 
   // always set VisitsStopper to avoid exceeding the limit 4000000000, the
   // default value when visits = 0
-  result->AddStopper(std::make_unique<VisitsStopper>(visits, false));
+  result->AddStopper(std::make_unique<classic::VisitsStopper>(visits, false));
   if (playouts >= 0) {
-    result->AddStopper(std::make_unique<PlayoutsStopper>(playouts, false));
+    result->AddStopper(
+        std::make_unique<classic::PlayoutsStopper>(playouts, false));
   }
   if (movetime >= 0) {
-    result->AddStopper(std::make_unique<TimeLimitStopper>(movetime));
+    result->AddStopper(std::make_unique<classic::TimeLimitStopper>(movetime));
   }
   return result;
 }
diff --git a/src/selfplay/game.h b/src/selfplay/game.h
index 918c328ce1..ff4d02539c 100644
--- a/src/selfplay/game.h
+++ b/src/selfplay/game.h
@@ -30,10 +30,9 @@
 #include "chess/pgn.h"
 #include "chess/position.h"
 #include "chess/uciloop.h"
-#include "mcts/search.h"
-#include "mcts/stoppers/stoppers.h"
-#include "neural/cache.h"
-#include "neural/network.h"
+#include "neural/backend.h"
+#include "search/classic/search.h"
+#include "search/classic/stoppers/stoppers.h"
 #include "trainingdata/trainingdata.h"
 #include "utils/optionsparser.h"
 
@@ -44,21 +43,19 @@ struct SelfPlayLimits {
   std::int64_t playouts = -1;
   std::int64_t movetime = -1;
 
-  std::unique_ptr<ChainedSearchStopper> MakeSearchStopper() const;
+  std::unique_ptr<classic::ChainedSearchStopper> MakeSearchStopper() const;
 };
 
 struct PlayerOptions {
   using OpeningCallback = std::function<void(const Opening&)>;
-  // Network to use by the player.
-  Network* network;
+  // Backend to use by the player.
+  Backend* backend;
   // Callback when player moves.
   CallbackUciResponder::BestMoveCallback best_move_callback;
   // Callback when player outputs info.
   CallbackUciResponder::ThinkingCallback info_callback;
   // Callback when player discards a selected move due to low visits.
   OpeningCallback discarded_callback;
-  // NNcache to use.
-  NNCache* cache;
   // User options dictionary.
   const OptionsDict* uci_options;
   // Limits to use for every move.
@@ -104,13 +101,13 @@ class SelfPlayGame {
   PlayerOptions options_[2];
   // Node tree for player1 and player2. If the tree is shared between players,
   // tree_[0] == tree_[1].
-  std::shared_ptr<NodeTree> tree_[2];
+  std::shared_ptr<classic::NodeTree> tree_[2];
   std::string orig_fen_;
   int start_ply_;
 
   // Search that is currently in progress. Stored in members so that Abort()
   // can stop it.
-  std::unique_ptr<Search> search_;
+  std::unique_ptr<classic::Search> search_;
   bool abort_ = false;
   GameResult game_result_ = GameResult::UNDECIDED;
   bool adjudicated_ = false;
diff --git a/src/selfplay/loop.cc b/src/selfplay/loop.cc
index dbf655e883..b1f3312a05 100644
--- a/src/selfplay/loop.cc
+++ b/src/selfplay/loop.cc
@@ -36,71 +36,34 @@
 namespace lczero {
 
 namespace {
-const OptionId kInteractiveId{
-    "interactive", "", "Run in interactive mode with UCI-like interface."};
-
 const OptionId kLogFileId{"logfile", "LogFile",
-  "Write log to that file. Special value <stderr> to "
-  "output the log to the console."};
+                          "Write log to that file. Special value <stderr> to "
+                          "output the log to the console."};
 }  // namespace
 
-
-SelfPlayLoop::SelfPlayLoop() {}
-
 SelfPlayLoop::~SelfPlayLoop() {
   if (tournament_) tournament_->Abort();
   if (thread_) thread_->join();
 }
 
-void SelfPlayLoop::RunLoop() {
+void SelfPlayLoop::Run() {
   SelfPlayTournament::PopulateOptions(&options_);
 
-  options_.Add<BoolOption>(kInteractiveId) = false;
   options_.Add<StringOption>(kLogFileId);
 
   if (!options_.ProcessAllFlags()) return;
-  
-  Logging::Get().SetFilename(options_.GetOptionsDict().Get<std::string>(kLogFileId));
-
-  if (options_.GetOptionsDict().Get<bool>(kInteractiveId)) {
-    UciLoop::RunLoop();
-  } else {
-    // Send id before starting tournament to allow wrapping client to know
-    // who we are.
-    SendId();
-    SelfPlayTournament tournament(
-        options_.GetOptionsDict(),
-        std::bind(&UciLoop::SendBestMove, this, std::placeholders::_1),
-        std::bind(&UciLoop::SendInfo, this, std::placeholders::_1),
-        std::bind(&SelfPlayLoop::SendGameInfo, this, std::placeholders::_1),
-        std::bind(&SelfPlayLoop::SendTournament, this, std::placeholders::_1));
-    tournament.RunBlocking();
-  }
-}
 
-void SelfPlayLoop::CmdUci() {
-  SendId();
-  for (const auto& option : options_.ListOptionsUci()) {
-    SendResponse(option);
-  }
-  SendResponse("uciok");
-}
+  Logging::Get().SetFilename(
+      options_.GetOptionsDict().Get<std::string>(kLogFileId));
 
-void SelfPlayLoop::CmdStart() {
-  if (tournament_) return;
-  tournament_ = std::make_unique<SelfPlayTournament>(
-      options_.GetOptionsDict(),
-      std::bind(&UciLoop::SendBestMove, this, std::placeholders::_1),
-      std::bind(&UciLoop::SendInfo, this, std::placeholders::_1),
+  // Send id before starting tournament to allow wrapping client to know
+  // who we are.
+  uci_responder_->SendId();
+  SelfPlayTournament tournament(
+      options_.GetOptionsDict(), uci_responder_,
       std::bind(&SelfPlayLoop::SendGameInfo, this, std::placeholders::_1),
       std::bind(&SelfPlayLoop::SendTournament, this, std::placeholders::_1));
-  thread_ =
-      std::make_unique<std::thread>([this]() { tournament_->RunBlocking(); });
-}
-
-void SelfPlayLoop::CmdStop() {
-  tournament_->Stop();
-  tournament_->Wait();
+  tournament.RunBlocking();
 }
 
 void SelfPlayLoop::SendGameInfo(const GameInfo& info) {
@@ -123,27 +86,20 @@ void SelfPlayLoop::SendGameInfo(const GameInfo& info) {
     res += " player1 " + std::string(*info.is_black ? "black" : "white");
   if (info.game_result != GameResult::UNDECIDED) {
     res += std::string(" result ") +
-           ((info.game_result == GameResult::DRAW)
-                ? "draw"
-                : (info.game_result == GameResult::WHITE_WON) ? "whitewon"
-                                                              : "blackwon");
+           ((info.game_result == GameResult::DRAW)        ? "draw"
+            : (info.game_result == GameResult::WHITE_WON) ? "whitewon"
+                                                          : "blackwon");
   }
   if (!info.moves.empty()) {
     res += " moves";
-    for (const auto& move : info.moves) res += " " + move.as_string();
+    for (const auto& move : info.moves) res += " " + move.ToString(true);
   }
   if (!info.initial_fen.empty() &&
       info.initial_fen != ChessBoard::kStartposFen) {
     res += " from_fen " + info.initial_fen;
   }
   responses.push_back(res);
-  SendResponses(responses);
-}
-
-void SelfPlayLoop::CmdSetOption(const std::string& name,
-                                const std::string& value,
-                                const std::string& context) {
-  options_.SetUciOption(name, value, context);
+  uci_responder_->SendRawResponses(responses);
 }
 
 void SelfPlayLoop::SendTournament(const TournamentInfo& info) {
@@ -195,7 +151,7 @@ void SelfPlayLoop::SendTournament(const TournamentInfo& info) {
                                   info.move_count_);
   oss << " nodes " + std::to_string(info.nodes_total_);
   oss << " moves " + std::to_string(info.move_count_);
-  SendResponse(oss.str());
+  uci_responder_->SendRawResponse(oss.str());
 }
 
 }  // namespace lczero
diff --git a/src/selfplay/loop.h b/src/selfplay/loop.h
index d48888b89b..36cdff2956 100644
--- a/src/selfplay/loop.h
+++ b/src/selfplay/loop.h
@@ -28,31 +28,31 @@
 #pragma once
 
 #include <thread>
+
 #include "chess/uciloop.h"
 #include "selfplay/tournament.h"
 #include "utils/optionsparser.h"
 
 namespace lczero {
 
-class SelfPlayLoop : public UciLoop {
+class SelfPlayLoop {
  public:
-  SelfPlayLoop();
+  SelfPlayLoop(StringUciResponder* uci_responder)
+      : uci_responder_(uci_responder) {
+    uci_responder_->PopulateParams(&options_);
+  }
   ~SelfPlayLoop();
 
-  void RunLoop() override;
-  void CmdStart() override;
-  void CmdStop() override;
-  void CmdUci() override;
-  void CmdSetOption(const std::string& name, const std::string& value,
-                    const std::string& context) override;
+  void Run();
 
  private:
   void SendGameInfo(const GameInfo& move);
   void SendTournament(const TournamentInfo& info);
 
   void EnsureOptionsSent();
-  OptionsParser options_;
 
+  StringUciResponder* uci_responder_ = nullptr;  // absl_notnull
+  OptionsParser options_;
   std::unique_ptr<SelfPlayTournament> tournament_;
   std::unique_ptr<std::thread> thread_;
 };
diff --git a/src/selfplay/multigame.cc b/src/selfplay/multigame.cc
index 1214cc9039..75a8070f74 100644
--- a/src/selfplay/multigame.cc
+++ b/src/selfplay/multigame.cc
@@ -32,95 +32,73 @@ namespace lczero {
 class PolicyEvaluator : public Evaluator {
  public:
   void Reset(const PlayerOptions& player) override {
-    comp = player.network->NewComputation();
-    input_format = player.network->GetCapabilities().input_format;
-    transforms.clear();
-    comp_idx = 0;
+    comp_ = player.backend->CreateComputation();
   }
-  void Gather(NodeTree* tree) override {
-    int transform;
-    auto planes =
-        EncodePositionForNN(input_format, tree->GetPositionHistory(), 8,
-                            FillEmptyHistory::FEN_ONLY, &transform);
-    transforms.push_back(transform);
-    comp->AddInput(std::move(planes));
-  }
-  void Run() override { comp->ComputeBlocking(); }
-  void MakeBestMove(NodeTree* tree) override {
-    Move best;
-    float max_p = std::numeric_limits<float>::lowest();
+  void Gather(classic::NodeTree* tree) override {
+    const auto& history = tree->GetPositionHistory();
+    moves_.clear();
     for (auto edge : tree->GetCurrentHead()->Edges()) {
-      float p = comp->GetPVal(comp_idx,
-                              edge.GetMove().as_nn_index(transforms[comp_idx]));
-      if (p >= max_p) {
-        max_p = p;
-        best = edge.GetMove(tree->GetPositionHistory().IsBlackToMove());
-      }
+      moves_.push_back(edge.GetMove());
     }
-    tree->MakeMove(best);
-    comp_idx++;
+    p_.resize(moves_.size());
+    comp_->AddInput(
+        EvalPosition{
+            .pos = history.GetPositions(),
+            .legal_moves = moves_,
+        },
+        EvalResultPtr{.p = p_});
+  }
+  void Run() override { comp_->ComputeBlocking(); }
+  void MakeBestMove(classic::NodeTree* tree) override {
+    size_t best_idx = std::max_element(p_.begin(), p_.end()) - p_.begin();
+    tree->MakeMove(moves_[best_idx]);
   }
 
-  std::unique_ptr<NetworkComputation> comp;
-  pblczero::NetworkFormat::InputFormat input_format;
-  int comp_idx;
-  std::vector<int> transforms;
+  std::unique_ptr<BackendComputation> comp_;
+  std::vector<Move> moves_;
+  std::vector<float> p_;
 };
 
 class ValueEvaluator : public Evaluator {
  public:
   void Reset(const PlayerOptions& player) override {
-    comp = player.network->NewComputation();
-    input_format = player.network->GetCapabilities().input_format;
-    comp_idx = 0;
-  }
-  void Gather(NodeTree* tree) override {
-    PositionHistory history = tree->GetPositionHistory();
-    for (auto edge : tree->GetCurrentHead()->Edges()) {
-      history.Append(edge.GetMove());
-      if (history.ComputeGameResult() == GameResult::UNDECIDED) {
-        int transform;
-        auto planes = EncodePositionForNN(
-            input_format, history, 8, FillEmptyHistory::FEN_ONLY, &transform);
-        comp->AddInput(std::move(planes));
-      }
-      history.Pop();
-    }
+    comp_ = player.backend->CreateComputation();
   }
-  void Run() override { comp->ComputeBlocking(); }
-  void MakeBestMove(NodeTree* tree) override {
-    Move best;
-    float max_q = std::numeric_limits<float>::lowest();
+  void Gather(classic::NodeTree* tree) override {
     PositionHistory history = tree->GetPositionHistory();
+    q_.clear();
+    q_.reserve(tree->GetCurrentHead()->GetNumEdges());
+    moves_.clear();
     for (auto edge : tree->GetCurrentHead()->Edges()) {
+      moves_.push_back(edge.GetMove());
       history.Append(edge.GetMove());
       auto result = history.ComputeGameResult();
-      float q = -1;
       if (result == GameResult::UNDECIDED) {
-        // NN eval is for side to move perspective - so if its good, its bad for
-        // us.
-        q = -comp->GetQVal(comp_idx);
-        comp_idx++;
+        comp_->AddInput(
+            EvalPosition{
+                .pos = history.GetPositions(),
+                .legal_moves = {},
+            },
+            EvalResultPtr{.q = &q_.emplace_back()});  // Safe: q_ reserved.
       } else if (result == GameResult::DRAW) {
-        q = 0;
+        q_.push_back(0);
       } else {
         // A legal move to a non-drawn terminal without tablebases must be a
         // win.
-        q = 1;
-      }
-      if (q >= max_q) {
-        max_q = q;
-        best = edge.GetMove(tree->GetPositionHistory().IsBlackToMove());
+        q_.push_back(-1);  // Our win, scored from the opponent's side like q.
       }
       history.Pop();
     }
-    tree->MakeMove(best);
+  }
+  void Run() override { comp_->ComputeBlocking(); }
+  void MakeBestMove(classic::NodeTree* tree) override {
+    size_t best_idx = std::min_element(q_.begin(), q_.end()) - q_.begin();
+    tree->MakeMove(moves_[best_idx]);  // Lowest opponent q is best for us.
   }
 
-  std::unique_ptr<NetworkComputation> comp;
-  pblczero::NetworkFormat::InputFormat input_format;
-  int comp_idx;
-  std::vector<int> transforms;
+  std::unique_ptr<BackendComputation> comp_;
+  std::vector<Move> moves_;
+  std::vector<float> q_;
 };
 
 MultiSelfPlayGames::MultiSelfPlayGames(PlayerOptions player1,
@@ -134,7 +112,7 @@ MultiSelfPlayGames::MultiSelfPlayGames(PlayerOptions player1,
               : std::unique_ptr<Evaluator>(std::make_unique<PolicyEvaluator>());
   trees_.reserve(openings.size());
   for (auto opening : openings) {
-    trees_.push_back(std::make_shared<NodeTree>());
+    trees_.push_back(std::make_shared<classic::NodeTree>());
     trees_.back()->ResetToPosition(opening.start_fen, {});
     results_.push_back(GameResult::UNDECIDED);
 
diff --git a/src/selfplay/multigame.h b/src/selfplay/multigame.h
index 122eb71bf5..ace4b2dcb4 100644
--- a/src/selfplay/multigame.h
+++ b/src/selfplay/multigame.h
@@ -37,11 +37,11 @@ class Evaluator {
   // Run before each batch before any Gather.
   virtual void Reset(const PlayerOptions& player) = 0;
   // Run for each tree.
-  virtual void Gather(NodeTree* tree) = 0;
+  virtual void Gather(classic::NodeTree* tree) = 0;
   // Run once between Gather and Move.
   virtual void Run() = 0;
   // Run for each tree in the same order as Gather.
-  virtual void MakeBestMove(NodeTree* tree) = 0;
+  virtual void MakeBestMove(classic::NodeTree* tree) = 0;
 };
 
 // Plays a bunch of games vs itself.
@@ -63,7 +63,7 @@ class MultiSelfPlayGames {
   std::vector<Move> GetMoves(int index) const {
     std::vector<Move> moves;
     bool flip = !trees_[index]->IsBlackToMove();
-    for (Node* node = trees_[index]->GetCurrentHead();
+    for (classic::Node* node = trees_[index]->GetCurrentHead();
          node != trees_[index]->GetGameBeginNode(); node = node->GetParent()) {
       moves.push_back(node->GetParent()->GetEdgeToNode(node)->GetMove(flip));
       flip = !flip;
@@ -77,7 +77,7 @@ class MultiSelfPlayGames {
   PlayerOptions options_[2];
   // Node tree for player1 and player2. If the tree is shared between players,
   // tree_[0] == tree_[1].
-  std::vector<std::shared_ptr<NodeTree>> trees_;
+  std::vector<std::shared_ptr<classic::NodeTree>> trees_;
   std::vector<GameResult> results_;
   bool abort_ = false;
   std::mutex mutex_;
diff --git a/src/selfplay/tournament.cc b/src/selfplay/tournament.cc
index 993e449a23..73871b75cf 100644
--- a/src/selfplay/tournament.cc
+++ b/src/selfplay/tournament.cc
@@ -30,9 +30,10 @@
 #include <fstream>
 
 #include "chess/pgn.h"
-#include "mcts/search.h"
-#include "mcts/stoppers/factory.h"
-#include "neural/factory.h"
+#include "neural/memcache.h"
+#include "neural/shared_params.h"
+#include "search/classic/search.h"
+#include "search/classic/stoppers/factory.h"
 #include "selfplay/game.h"
 #include "selfplay/multigame.h"
 #include "utils/optionsparser.h"
@@ -110,10 +111,9 @@ void SelfPlayTournament::PopulateOptions(OptionsParser* options) {
     dict->AddSubdict("black")->AddAliasDict(&options->GetOptionsDict("black"));
   }
 
-  NetworkFactory::PopulateOptions(options);
+  SharedBackendParams::Populate(options);
   options->Add<IntOption>(kThreadsId, 1, 8) = 1;
-  options->Add<IntOption>(kNNCacheSizeId, 0, 999999999) = 2000000;
-  SearchParams::Populate(options);
+  classic::SearchParams::Populate(options);
 
   options->Add<BoolOption>(kShareTreesId) = true;
   options->Add<IntOption>(kTotalGamesId, -2, 999999) = -1;
@@ -139,35 +139,33 @@ void SelfPlayTournament::PopulateOptions(OptionsParser* options) {
   SelfPlayGame::PopulateUciParams(options);
 
   auto defaults = options->GetMutableDefaultsOptions();
-  defaults->Set<int>(SearchParams::kMiniBatchSizeId, 32);
-  defaults->Set<float>(SearchParams::kCpuctId, 1.2f);
-  defaults->Set<float>(SearchParams::kCpuctFactorId, 0.0f);
-  defaults->Set<float>(SearchParams::kPolicySoftmaxTempId, 1.0f);
-  defaults->Set<int>(SearchParams::kMaxCollisionVisitsId, 1);
-  defaults->Set<int>(SearchParams::kMaxCollisionEventsId, 1);
-  defaults->Set<int>(SearchParams::kCacheHistoryLengthId, 7);
-  defaults->Set<bool>(SearchParams::kOutOfOrderEvalId, false);
-  defaults->Set<float>(SearchParams::kTemperatureId, 1.0f);
-  defaults->Set<float>(SearchParams::kNoiseEpsilonId, 0.25f);
-  defaults->Set<float>(SearchParams::kFpuValueId, 0.0f);
-  defaults->Set<std::string>(SearchParams::kHistoryFillId, "no");
-  defaults->Set<std::string>(NetworkFactory::kBackendId, "multiplexing");
-  defaults->Set<bool>(SearchParams::kStickyEndgamesId, false);
-  defaults->Set<bool>(SearchParams::kTwoFoldDrawsId, false);
-  defaults->Set<int>(SearchParams::kTaskWorkersPerSearchWorkerId, 0);
+  defaults->Set<int>(classic::SearchParams::kMiniBatchSizeId, 32);
+  defaults->Set<float>(classic::SearchParams::kCpuctId, 1.2f);
+  defaults->Set<float>(classic::SearchParams::kCpuctFactorId, 0.0f);
+  defaults->Set<float>(SharedBackendParams::kPolicySoftmaxTemp, 1.0f);
+  defaults->Set<int>(classic::SearchParams::kMaxCollisionVisitsId, 1);
+  defaults->Set<int>(classic::SearchParams::kMaxCollisionEventsId, 1);
+  defaults->Set<int>(classic::SearchParams::kCacheHistoryLengthId, 7);
+  defaults->Set<bool>(classic::SearchParams::kOutOfOrderEvalId, false);
+  defaults->Set<float>(classic::SearchParams::kTemperatureId, 1.0f);
+  defaults->Set<float>(classic::SearchParams::kNoiseEpsilonId, 0.25f);
+  defaults->Set<float>(classic::SearchParams::kFpuValueId, 0.0f);
+  defaults->Set<std::string>(SharedBackendParams::kHistoryFill, "no");
+  defaults->Set<std::string>(SharedBackendParams::kBackendId, "multiplexing");
+  defaults->Set<bool>(classic::SearchParams::kStickyEndgamesId, false);
+  defaults->Set<bool>(classic::SearchParams::kTwoFoldDrawsId, false);
+  defaults->Set<int>(classic::SearchParams::kTaskWorkersPerSearchWorkerId, 0);
 }
 
-SelfPlayTournament::SelfPlayTournament(
-    const OptionsDict& options,
-    CallbackUciResponder::BestMoveCallback best_move_info,
-    CallbackUciResponder::ThinkingCallback thinking_info,
-    GameInfo::Callback game_info, TournamentInfo::Callback tournament_info)
+SelfPlayTournament::SelfPlayTournament(const OptionsDict& options,
+                                       UciResponder* uci_responder,
+                                       GameInfo::Callback game_info,
+                                       TournamentInfo::Callback tournament_info)
     : player_options_{{options.GetSubdict("player1").GetSubdict("white"),
                        options.GetSubdict("player1").GetSubdict("black")},
                       {options.GetSubdict("player2").GetSubdict("white"),
                        options.GetSubdict("player2").GetSubdict("black")}},
-      best_move_callback_(best_move_info),
-      info_callback_(thinking_info),
+      uci_responder_(uci_responder),
       game_callback_(game_info),
       tournament_callback_(tournament_info),
       kTotalGames(options.Get<int>(kTotalGamesId)),
@@ -212,30 +210,32 @@ SelfPlayTournament::SelfPlayTournament(
     first_game_black_ = Random::Get().GetBool();
   }
 
+  static constexpr const char* kPlayerNames[2] = {"player1", "player2"};
+  static constexpr const char* kPlayerColors[2] = {"white", "black"};
+
   // Initializing networks.
-  for (const auto& name : {"player1", "player2"}) {
-    for (const auto& color : {"white", "black"}) {
+  std::vector<std::shared_ptr<Backend>> backend_list;
+  for (int name_idx : {0, 1}) {
+    for (int color_idx : {0, 1}) {
+      const auto& name = kPlayerNames[name_idx];
+      const auto& color = kPlayerColors[color_idx];
       const auto& opts = options.GetSubdict(name).GetSubdict(color);
-      const auto config = NetworkFactory::BackendConfiguration(opts);
-      if (networks_.find(config) == networks_.end()) {
-        networks_.emplace(config, NetworkFactory::LoadNetwork(opts));
+      for (const auto& backend : backend_list) {
+        if (backend->IsSameConfiguration(opts)) {
+          backends_[name_idx][color_idx] = backend;
+          break;
+        }
+      }
+      if (!backends_[name_idx][color_idx]) {
+        backends_[name_idx][color_idx] =
+            CreateMemCache(BackendManager::Get()->CreateFromParams(opts),
+                           options.GetSubdict(name));
+        backend_list.emplace_back(backends_[name_idx][color_idx]);
       }
     }
   }
 
-  // Initializing cache.
-  cache_[0] = std::make_shared<NNCache>(
-      options.GetSubdict("player1").Get<int>(kNNCacheSizeId));
-  if (kShareTree) {
-    cache_[1] = cache_[0];
-  } else {
-    cache_[1] = std::make_shared<NNCache>(
-        options.GetSubdict("player2").Get<int>(kNNCacheSizeId));
-  }
-
   // SearchLimits.
-  static constexpr const char* kPlayerNames[2] = {"player1", "player2"};
-  static constexpr const char* kPlayerColors[2] = {"white", "black"};
   for (int name_idx : {0, 1}) {
     for (int color_idx : {0, 1}) {
       auto& limits = search_limits_[name_idx][color_idx];
@@ -311,10 +311,7 @@ void SelfPlayTournament::PlayOneGame(int game_number) {
         player_options_[pl_idx][color].Get<bool>(kMoveThinkingId);
     // Populate per-player options.
     PlayerOptions& opt = options[color_idx[pl_idx]];
-    opt.network = networks_[NetworkFactory::BackendConfiguration(
-                                player_options_[pl_idx][color])]
-                      .get();
-    opt.cache = cache_[pl_idx].get();
+    opt.backend = backends_[pl_idx][color].get();
     opt.uci_options = &player_options_[pl_idx][color];
     opt.search_limits = search_limits_[pl_idx][color];
 
@@ -328,14 +325,14 @@ void SelfPlayTournament::PlayOneGame(int game_number) {
       }
       // In non-verbose mode, output the last "info" message.
       if (!verbose_thinking && !last_thinking_info.empty()) {
-        info_callback_(last_thinking_info);
+        uci_responder_->OutputThinkingInfo(&last_thinking_info);
         last_thinking_info.clear();
       }
       BestMoveInfo rich_info = info;
       rich_info.player = pl_idx + 1;
       rich_info.is_black = player1_black ? pl_idx == 0 : pl_idx != 0;
       rich_info.game_id = game_number;
-      best_move_callback_(rich_info);
+      uci_responder_->OutputBestMove(&rich_info);
     };
 
     opt.info_callback =
@@ -348,7 +345,7 @@ void SelfPlayTournament::PlayOneGame(int game_number) {
             info.game_id = game_number;
           }
           if (verbose_thinking) {
-            info_callback_(rich_info);
+            uci_responder_->OutputThinkingInfo(&rich_info);
           } else {
             // In non-verbose mode, remember the last "info" messages.
             last_thinking_info = std::move(rich_info);
@@ -452,12 +449,8 @@ void SelfPlayTournament::PlayMultiGames(int game_id, size_t game_count) {
   }
 
   PlayerOptions options[2];
-  options[0].network =
-      networks_[NetworkFactory::BackendConfiguration(player_options_[0][0])]
-          .get();
-  options[1].network =
-      networks_[NetworkFactory::BackendConfiguration(player_options_[1][1])]
-          .get();
+  options[0].backend = backends_[0][0].get();
+  options[1].backend = backends_[1][1].get();
 
   std::list<std::unique_ptr<MultiSelfPlayGames>>::iterator game1_iter;
   auto aborted = false;
@@ -473,12 +466,8 @@ void SelfPlayTournament::PlayMultiGames(int game_id, size_t game_count) {
   // PLAY GAMEs!
   if (!aborted) game1.Play();
 
-  options[0].network =
-      networks_[NetworkFactory::BackendConfiguration(player_options_[0][1])]
-          .get();
-  options[1].network =
-      networks_[NetworkFactory::BackendConfiguration(player_options_[1][0])]
-          .get();
+  options[0].backend = backends_[0][1].get();
+  options[1].backend = backends_[1][0].get();
 
   std::list<std::unique_ptr<MultiSelfPlayGames>>::iterator game2_iter;
   {
@@ -658,9 +647,9 @@ void SelfPlayTournament::SaveResults() {
   if (kTournamentResultsFile.empty()) return;
   std::ofstream output(kTournamentResultsFile, std::ios_base::app);
   auto p1name =
-      player_options_[0][0].Get<std::string>(NetworkFactory::kWeightsId);
+      player_options_[0][0].Get<std::string>(SharedBackendParams::kWeightsId);
   auto p2name =
-      player_options_[1][0].Get<std::string>(NetworkFactory::kWeightsId);
+      player_options_[1][0].Get<std::string>(SharedBackendParams::kWeightsId);
 
   output << std::endl;
   output << "[White \"" << p1name << "\"]" << std::endl;
diff --git a/src/selfplay/tournament.h b/src/selfplay/tournament.h
index d695570a2e..f9e2dd9733 100644
--- a/src/selfplay/tournament.h
+++ b/src/selfplay/tournament.h
@@ -30,7 +30,8 @@
 #include <list>
 
 #include "chess/pgn.h"
-#include "neural/factory.h"
+#include "neural/backend.h"
+#include "neural/register.h"
 #include "selfplay/game.h"
 #include "selfplay/multigame.h"
 #include "utils/mutex.h"
@@ -42,9 +43,7 @@ namespace lczero {
 // Runs many selfplay games, possibly in parallel.
 class SelfPlayTournament {
  public:
-  SelfPlayTournament(const OptionsDict& options,
-                     CallbackUciResponder::BestMoveCallback best_move_info,
-                     CallbackUciResponder::ThinkingCallback thinking_info,
+  SelfPlayTournament(const OptionsDict& options, UciResponder* uci_responder,
                      GameInfo::Callback game_info,
                      TournamentInfo::Callback tournament_info);
 
@@ -95,16 +94,12 @@ class SelfPlayTournament {
   Mutex threads_mutex_;
   std::vector<std::thread> threads_ GUARDED_BY(threads_mutex_);
 
-  // Map from the backend configuration to a network.
-  std::map<NetworkFactory::BackendConfiguration, std::unique_ptr<Network>>
-      networks_;
-  std::shared_ptr<NNCache> cache_[2];
   // [player1 or player2][white or black].
+  std::shared_ptr<Backend> backends_[2][2];
   const OptionsDict player_options_[2][2];
   SelfPlayLimits search_limits_[2][2];
 
-  CallbackUciResponder::BestMoveCallback best_move_callback_;
-  CallbackUciResponder::ThinkingCallback info_callback_;
+  UciResponder* uci_responder_;
   GameInfo::Callback game_callback_;
   TournamentInfo::Callback tournament_callback_;
   const int kTotalGames;
diff --git a/src/syzygy/syzygy.cc b/src/syzygy/syzygy.cc
index 115164b772..75bd302dea 100644
--- a/src/syzygy/syzygy.cc
+++ b/src/syzygy/syzygy.cc
@@ -31,6 +31,8 @@
   Program grant you additional permission to convey the resulting work.
 */
 
+#include "syzygy/syzygy.h"
+
 #include <atomic>
 #include <cstdint>
 #include <cstdio>
@@ -42,8 +44,6 @@
 #include <sstream>
 #include <string>
 
-#include "syzygy/syzygy.h"
-
 #include "utils/exception.h"
 #include "utils/logging.h"
 #include "utils/mutex.h"
@@ -228,7 +228,7 @@ bool is_capture(const ChessBoard& pos, const Move& move) {
   // Simple capture.
   if (pos.theirs().get(move.to())) return true;
   // Enpassant capture. Pawn moves other than straight it must be a capture.
-  if (pos.pawns().get(move.from()) && move.from().col() != move.to().col()) {
+  if (pos.pawns().get(move.from()) && move.from().file() != move.to().file()) {
     return true;
   }
   return false;
@@ -272,8 +272,12 @@ int num_tables(BaseEntry* be, const int type) {
 }
 
 EncInfo* first_ei(BaseEntry* be, const int type) {
-  return be->hasPawns ? &PAWN(be)->ei[type == WDL ? 0 : type == DTM ? 8 : 20]
-                      : &PIECE(be)->ei[type == WDL ? 0 : type == DTM ? 2 : 4];
+  return be->hasPawns ? &PAWN(be)->ei[type == WDL   ? 0
+                                      : type == DTM ? 8
+                                                    : 20]
+                      : &PIECE(be)->ei[type == WDL   ? 0
+                                       : type == DTM ? 2
+                                                     : 4];
 }
 
 constexpr int8_t kOffDiag[] = {
@@ -639,10 +643,10 @@ size_t init_enc_info(EncInfo* ei, BaseEntry* be, uint8_t* tb, int shift, int t,
   for (int i = 0; k < be->num || i == order || i == order2; i++) {
     if (i == order) {
       ei->factor[0] = f;
-      f *= enc == FILE_ENC
-               ? PawnFactorFile[ei->norm[0] - 1][t]
-               : enc == RANK_ENC ? PawnFactorRank[ei->norm[0] - 1][t]
-                                 : be->kk_enc ? 462 : 31332;
+      f *= enc == FILE_ENC   ? PawnFactorFile[ei->norm[0] - 1][t]
+           : enc == RANK_ENC ? PawnFactorRank[ei->norm[0] - 1][t]
+           : be->kk_enc      ? 462
+                             : 31332;
     } else if (i == order2) {
       ei->factor[ei->norm[0]] = f;
       f *= subfactor(ei->norm[ei->norm[0]], 48 - ei->norm[0]);
@@ -851,7 +855,7 @@ int fill_squares(const ChessBoard& pos, uint8_t* pc, bool flip, int mirror,
   BitBoard bb = pieces(pos, pc[i] & 7,
                        static_cast<bool>((pc[i] >> 3)) ^ flip ^ pos.flipped());
   for (auto sq : bb) {
-    p[i++] = sq.as_int() ^ mirror;
+    p[i++] = sq.as_idx() ^ mirror;
   }
   return i;
 }
@@ -872,20 +876,20 @@ class SyzygyTablebaseImpl {
     char str[33];
 
     for (int i = 0; i < 5; i++) {
-      sprintf(str, "K%cvK", pchr(i));
+      snprintf(str, 5, "K%cvK", pchr(i));
       init_tb(str);
     }
 
     for (int i = 0; i < 5; i++) {
       for (int j = i; j < 5; j++) {
-        sprintf(str, "K%cvK%c", pchr(i), pchr(j));
+        snprintf(str, 6, "K%cvK%c", pchr(i), pchr(j));
         init_tb(str);
       }
     }
 
     for (int i = 0; i < 5; i++) {
       for (int j = i; j < 5; j++) {
-        sprintf(str, "K%c%cvK", pchr(i), pchr(j));
+        snprintf(str, 6, "K%c%cvK", pchr(i), pchr(j));
         init_tb(str);
       }
     }
@@ -893,7 +897,7 @@ class SyzygyTablebaseImpl {
     for (int i = 0; i < 5; i++) {
       for (int j = i; j < 5; j++) {
         for (int k = 0; k < 5; k++) {
-          sprintf(str, "K%c%cvK%c", pchr(i), pchr(j), pchr(k));
+          snprintf(str, 7, "K%c%cvK%c", pchr(i), pchr(j), pchr(k));
           init_tb(str);
         }
       }
@@ -902,7 +906,7 @@ class SyzygyTablebaseImpl {
     for (int i = 0; i < 5; i++) {
       for (int j = i; j < 5; j++) {
         for (int k = j; k < 5; k++) {
-          sprintf(str, "K%c%c%cvK", pchr(i), pchr(j), pchr(k));
+          snprintf(str, 7, "K%c%c%cvK", pchr(i), pchr(j), pchr(k));
           init_tb(str);
         }
       }
@@ -915,7 +919,7 @@ class SyzygyTablebaseImpl {
       for (int j = i; j < 5; j++) {
         for (int k = i; k < 5; k++) {
           for (int l = (i == k) ? j : k; l < 5; l++) {
-            sprintf(str, "K%c%cvK%c%c", pchr(i), pchr(j), pchr(k), pchr(l));
+            snprintf(str, 8, "K%c%cvK%c%c", pchr(i), pchr(j), pchr(k), pchr(l));
             init_tb(str);
           }
         }
@@ -926,7 +930,7 @@ class SyzygyTablebaseImpl {
       for (int j = i; j < 5; j++) {
         for (int k = j; k < 5; k++) {
           for (int l = 0; l < 5; l++) {
-            sprintf(str, "K%c%c%cvK%c", pchr(i), pchr(j), pchr(k), pchr(l));
+            snprintf(str, 8, "K%c%c%cvK%c", pchr(i), pchr(j), pchr(k), pchr(l));
             init_tb(str);
           }
         }
@@ -937,7 +941,7 @@ class SyzygyTablebaseImpl {
       for (int j = i; j < 5; j++) {
         for (int k = j; k < 5; k++) {
           for (int l = k; l < 5; l++) {
-            sprintf(str, "K%c%c%c%cvK", pchr(i), pchr(j), pchr(k), pchr(l));
+            snprintf(str, 8, "K%c%c%c%cvK", pchr(i), pchr(j), pchr(k), pchr(l));
             init_tb(str);
           }
         }
@@ -951,8 +955,8 @@ class SyzygyTablebaseImpl {
         for (int k = j; k < 5; k++) {
           for (int l = k; l < 5; l++) {
             for (int m = l; m < 5; m++) {
-              sprintf(str, "K%c%c%c%c%cvK", pchr(i), pchr(j), pchr(k), pchr(l),
-                      pchr(m));
+              snprintf(str, 9, "K%c%c%c%c%cvK", pchr(i), pchr(j), pchr(k),
+                       pchr(l), pchr(m));
               init_tb(str);
             }
           }
@@ -965,8 +969,8 @@ class SyzygyTablebaseImpl {
         for (int k = j; k < 5; k++) {
           for (int l = k; l < 5; l++) {
             for (int m = 0; m < 5; m++) {
-              sprintf(str, "K%c%c%c%cvK%c", pchr(i), pchr(j), pchr(k), pchr(l),
-                      pchr(m));
+              snprintf(str, 9, "K%c%c%c%cvK%c", pchr(i), pchr(j), pchr(k),
+                       pchr(l), pchr(m));
               init_tb(str);
             }
           }
@@ -979,8 +983,8 @@ class SyzygyTablebaseImpl {
         for (int k = j; k < 5; k++) {
           for (int l = 0; l < 5; l++) {
             for (int m = l; m < 5; m++) {
-              sprintf(str, "K%c%c%cvK%c%c", pchr(i), pchr(j), pchr(k), pchr(l),
-                      pchr(m));
+              snprintf(str, 9, "K%c%c%cvK%c%c", pchr(i), pchr(j), pchr(k),
+                       pchr(l), pchr(m));
               init_tb(str);
             }
           }
@@ -1075,7 +1079,8 @@ class SyzygyTablebaseImpl {
     *mapping = mmap;
     base_address = MapViewOfFile(mmap, FILE_MAP_READ, 0, 0, 0);
     if (!base_address) {
-      throw Exception("MapViewOfFile() failed, name = " + fname + ", error = " + std::to_string(GetLastError()));
+      throw Exception("MapViewOfFile() failed, name = " + fname +
+                      ", error = " + std::to_string(GetLastError()));
     }
 #endif
     return base_address;
@@ -1201,7 +1206,9 @@ class SyzygyTablebaseImpl {
     size_t tb_size[6][2];
     const int num = num_tables(be, type);
     EncInfo* ei = first_ei(be, type);
-    const int enc = !be->hasPawns ? PIECE_ENC : type != DTM ? FILE_ENC : RANK_ENC;
+    const int enc = !be->hasPawns ? PIECE_ENC
+                    : type != DTM ? FILE_ENC
+                                  : RANK_ENC;
 
     for (int t = 0; t < num; t++) {
       tb_size[t][0] = init_enc_info(&ei[t], be, data, 0, t, enc);
@@ -1399,8 +1406,9 @@ class SyzygyTablebaseImpl {
           return 0;
         }
       }
-      ei = type == WDL ? &ei[t + 4 * bside]
-                       : type == DTM ? &ei[t + 6 * bside] : &ei[t];
+      ei = type == WDL   ? &ei[t + 4 * bside]
+           : type == DTM ? &ei[t + 6 * bside]
+                         : &ei[t];
       while (i < be->num) {
         i = fill_squares(pos, ei->pieces, flip, flip ? 0x38 : 0, p, i);
       }
@@ -1658,11 +1666,10 @@ bool SyzygyTablebase::root_probe(const Position& pos, bool has_repeated,
     if (result == FAIL) return false;
     // Better moves are ranked higher. Certain wins are ranked equally.
     // Losing moves are ranked equally unless a 50-move draw is in sight.
-    int r = dtz > 0
-                ? (dtz + cnt50 <= 99 && !rep ? 1000 : 1000 - (dtz + cnt50))
-                : dtz < 0 ? (-dtz * 2 + cnt50 < 100 ? -1000
-                                                    : -1000 + (-dtz + cnt50))
-                          : 0;
+    int r = dtz > 0 ? (dtz + cnt50 <= 99 && !rep ? 1000 : 1000 - (dtz + cnt50))
+            : dtz < 0
+                ? (-dtz * 2 + cnt50 < 100 ? -1000 : -1000 + (-dtz + cnt50))
+                : 0;
     if (r > best_rank) best_rank = r;
     ranks.push_back(r);
   }
diff --git a/src/syzygy/syzygy_test.cc b/src/syzygy/syzygy_test.cc
index 06f1381283..c16796796b 100644
--- a/src/syzygy/syzygy_test.cc
+++ b/src/syzygy/syzygy_test.cc
@@ -25,22 +25,23 @@
   Program grant you additional permission to convey the resulting work.
 */
 
+#include "src/syzygy/syzygy.h"
+
 #include <gtest/gtest.h>
 
 #include <iostream>
-#include "src/syzygy/syzygy.h"
 
 namespace lczero {
 
 // Try to find syzygy relative to current working directory.
 constexpr auto kPaths = "syzygy";
 
-void TestValidRootExpectation(SyzygyTablebase* tablebase,
-                              const std::string& fen,
-                              const MoveList& valid_moves,
-                              const MoveList& invalid_moves,
-                              const MoveList& invalid_dtz_only = {},
-                              bool has_repeated = false) {
+void TestValidRootExpectation(
+    SyzygyTablebase* tablebase, const std::string& fen,
+    const std::vector<std::string>& valid_moves,
+    const std::vector<std::string>& invalid_moves,
+    const std::vector<std::string>& invalid_dtz_only = {},
+    bool has_repeated = false) {
   ChessBoard board;
   PositionHistory history;
   int rule50ply;
@@ -52,19 +53,22 @@ void TestValidRootExpectation(SyzygyTablebase* tablebase,
                         &allowed_moves_dtz);
   MoveList allowed_moves_wdl;
   tablebase->root_probe_wdl(history.Last(), &allowed_moves_wdl);
-  for (auto move : valid_moves) {
+  for (auto move_str : valid_moves) {
+    Move move = board.ParseMove(move_str);
     EXPECT_TRUE(std::find(allowed_moves_dtz.begin(), allowed_moves_dtz.end(),
                           move) != allowed_moves_dtz.end());
     EXPECT_TRUE(std::find(allowed_moves_wdl.begin(), allowed_moves_wdl.end(),
                           move) != allowed_moves_wdl.end());
   }
-  for (auto move : invalid_moves) {
+  for (auto move_str : invalid_moves) {
+    Move move = board.ParseMove(move_str);
     EXPECT_FALSE(std::find(allowed_moves_dtz.begin(), allowed_moves_dtz.end(),
                            move) != allowed_moves_dtz.end());
     EXPECT_FALSE(std::find(allowed_moves_wdl.begin(), allowed_moves_wdl.end(),
                            move) != allowed_moves_wdl.end());
   }
-  for (auto move : invalid_dtz_only) {
+  for (auto move_str : invalid_dtz_only) {
+    Move move = board.ParseMove(move_str);
     EXPECT_FALSE(std::find(allowed_moves_dtz.begin(), allowed_moves_dtz.end(),
                            move) != allowed_moves_dtz.end());
     EXPECT_TRUE(std::find(allowed_moves_wdl.begin(), allowed_moves_wdl.end(),
@@ -135,15 +139,15 @@ TEST(Syzygy, Root3PieceProbes) {
     return;
   }
   TestValidRootExpectation(&tablebase, "5Qk1/8/8/8/8/8/8/4K3 b - - 0 1",
-                           {Move("g8f8", true)}, {Move("g8h7", true)});
+                           {"g8f8"}, {"g8h7"});
   TestValidRootExpectation(&tablebase, "6k1/8/8/8/8/5p2/8/2K5 b - - 0 1",
-                           {Move("f3f2", true)}, {Move("g8h7", true)});
+                           {"f3f2"}, {"g8h7"});
   TestValidRootExpectation(&tablebase, "8/8/8/8/8/k1p5/8/3K4 b - - 0 1",
-                           {Move("a3b3", true)}, {Move("c3c2", true)});
+                           {"a3b3"}, {"c3c2"});
   // WDL doesn't know that with such a high 50 ply count this position has
   // become a blessed loss (draw) for black.
   TestValidRootExpectation(&tablebase, "8/8/8/8/8/8/2Rk4/1K6 b - - 69 71",
-                           {Move("d2d3", true)}, {}, {Move("d2e3", true)});
+                           {"d2d3"}, {}, {"d2e3"});
 }
 
 TEST(Syzygy, Simple4PieceProbes) {
@@ -227,18 +231,17 @@ TEST(Syzygy, Root5PieceProbes) {
     return;
   }
   TestValidRootExpectation(&tablebase, "8/8/8/Q7/8/1k1K4/1r6/8 w - - 79 44",
-                           {Move("a5a1", false)}, {}, {Move("a5d5", false)});
+                           {"a5a1"}, {}, {"a5d5"});
   TestValidRootExpectation(&tablebase, "8/8/8/3Q4/k7/3K4/1r6/8 w - - 81 45",
-                           {Move("d5a8", false)}, {}, {Move("d3c3", false)});
+                           {"d5a8"}, {}, {"d3c3"});
 
   // Variant of first test but with plenty of moves left.
   TestValidRootExpectation(&tablebase, "8/8/8/Q7/8/1k1K4/1r6/8 w - - 60 44",
-                           {Move("a5a1", false), Move("a5d5", false)}, {}, {});
+                           {"a5a1", "a5d5"}, {}, {});
   // Same, but this time there is a repetition in history, so dtz will enforce
   // choice of equal lowest dtz.
   TestValidRootExpectation(&tablebase, "8/8/8/Q7/8/1k1K4/1r6/8 w - - 60 44",
-                           {Move("a5a1", false)}, {}, {Move("a5d5", false)},
-                           true);
+                           {"a5a1"}, {}, {"a5d5"}, true);
 }
 
 }  // namespace lczero
diff --git a/src/tools/backendbench.cc b/src/tools/backendbench.cc
new file mode 100644
index 0000000000..ff048048e5
--- /dev/null
+++ b/src/tools/backendbench.cc
@@ -0,0 +1,285 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2020-2021 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "tools/backendbench.h"
+
+#include "chess/board.h"
+#include "neural/register.h"
+#include "neural/shared_params.h"
+#include "search/classic/node.h"
+#include "utils/optionsparser.h"
+
+namespace lczero {
+namespace {
+const int kDefaultThreads = 1;
+
+const OptionId kThreadsOptionId{"threads", "Threads",
+                                "Number of (CPU) worker threads to use.", 't'};
+const OptionId kBatchesId{"batches", "",
+                          "Number of batches to run as a benchmark."};
+const OptionId kStartBatchSizeId{"start-batch-size", "",
+                                 "Start benchmark from this batch size."};
+const OptionId kMaxBatchSizeId{"max-batch-size", "",
+                               "Maximum batch size to benchmark."};
+const OptionId kBatchStepId{"batch-step", "",
+                            "Step of batch size in benchmark."};
+const OptionId kHeaderOnlyOnceId{"header-only-once", "",
+                                 "Print CSV header only once."};
+const OptionId kFenId{"fen", "", "Benchmark initial position FEN."};
+
+const OptionId kClippyId{"clippy", "", "Enable helpful assistant."};
+
+// Prints an ASCII-art assistant next to a box with `title` and three rows.
+// Padding is clamped at zero so short titles or wide numbers cannot underflow.
+void Clippy(std::string title, std::string msg3, std::string best3,
+            std::string msg2, std::string best2, std::string msg,
+            std::string best) {
+  const size_t width = title.length();
+  auto pad = [](size_t want, size_t used) {
+    return std::string(want > used ? want - used : 0, ' ');
+  };
+  auto row = [&](const char* art, const std::string& m, const std::string& b) {
+    std::cout << art << std::string(6, ' ') << m << pad(4, b.length()) << b
+              << pad(width, 33) << "|" << std::endl;
+  };
+  std::cout << "  __\n /  \\\n |  |    " << std::string(width + 2, '_') << "\n";
+  std::cout << " +  +   | " << std::string(width + 1, ' ') << "|" << std::endl;
+  std::cout << "(@)(@) _| " << title << " |" << std::endl;
+  row(" |  |  \\  ", msg3, best3);
+  row(" || |/  | ", msg2, best2);
+  row(" || ||  | ", msg, best);
+  std::cout << " |\\_/|  |" << std::string(width + 2, '_') << "|" << std::endl;
+  std::cout << " \\___/" << std::endl;
+}
+}  // namespace
+
+// Benchmarks the configured backend over a range of batch sizes and prints one
+// comma-separated row of timing statistics per batch size to stdout.
+void BackendBenchmark::Run() {
+  OptionsParser options;
+  SharedBackendParams::Populate(&options);
+  options.Add<IntOption>(kThreadsOptionId, 1, 128) = kDefaultThreads;
+  options.Add<IntOption>(kBatchesId, 1, 999999999) = 100;
+  options.Add<IntOption>(kStartBatchSizeId, 1, 1024) = 1;
+  options.Add<IntOption>(kMaxBatchSizeId, 1, 1024) = 256;
+  options.Add<IntOption>(kBatchStepId, 1, 256) = 1;
+  options.Add<BoolOption>(kHeaderOnlyOnceId) = false;
+  options.Add<StringOption>(kFenId) = ChessBoard::kStartposFen;
+  options.Add<BoolOption>(kClippyId) = false;
+
+  if (!options.ProcessAllFlags()) return;
+  try {
+    auto option_dict = options.GetOptionsDict();
+
+    auto backend = BackendManager::Get()->CreateFromParams(option_dict);
+    const int threads = option_dict.Get<int>(kThreadsOptionId);
+
+    classic::NodeTree tree;
+    tree.ResetToPosition(option_dict.Get<std::string>(kFenId), {});
+    EvalPosition pos{tree.GetPositionHistory().GetPositions(), {}};
+    std::vector<std::thread> handles;
+
+    // Do any backend initialization outside the loop.
+    auto warm = [&]() {
+      // Give GPU enough work to make it go from idle clocks to max clocks.
+      for (int i = 0; i < 2; i++) {
+        auto warmup = backend->CreateComputation();
+        for (int j = 0; j < option_dict.Get<int>(kMaxBatchSizeId); ++j) {
+          warmup->AddInput(pos, {});
+        }
+        warmup->ComputeBlocking();
+      }
+    };
+    for (int t = 1; t < threads; t++) {
+      handles.emplace_back(warm);
+    }
+    warm();
+    for (auto& handle : handles) {
+      handle.join();
+    }
+    handles.clear();
+
+    const int batches = option_dict.Get<int>(kBatchesId);
+
+    int best = 1;
+    int best2 = 1;
+    int best3 = 1;
+    float best_nps = 0.0f;
+    float best_nps2 = 0.0f;
+    float best_nps3 = 0.0f;
+    std::optional<std::chrono::time_point<std::chrono::steady_clock>> pending;
+    using tp = std::chrono::time_point<std::chrono::steady_clock>;
+    std::vector<std::vector<tp>> ends(threads);
+    for (auto& vend : ends) {
+      vend.resize(batches + 1);
+    }
+    std::vector<std::chrono::duration<double>> times(batches);
+    std::vector<int> thread_counts(threads);
+    for (int i = option_dict.Get<int>(kStartBatchSizeId);
+         i <= option_dict.Get<int>(kMaxBatchSizeId);
+         i += option_dict.Get<int>(kBatchStepId)) {
+      handles.reserve(threads);
+      std::atomic<int> j{0};
+
+      auto compute = [&](int tid = 0) {
+        int count = 0;
+        auto& end = ends[tid];
+        // Ignore the first batch to let GPU queue fill for stable measurements.
+        while (j++ < batches) {
+          // Put i copies of tree root node into computation and compute.
+          auto computation = backend->CreateComputation();
+          for (int k = 0; k < i; k++) {
+            computation->AddInput(pos, {});
+          }
+          computation->ComputeBlocking();
+          end[count++] = std::chrono::steady_clock::now();
+        }
+        thread_counts[tid] = count;
+      };
+
+      for (int t = 1; t < threads; t++) {
+        handles.emplace_back(compute, t);
+      }
+
+      compute(0);
+      for (auto& handle : handles) {
+        handle.join();
+      }
+
+      handles.clear();
+      // Convert each thread's end timestamps into per-batch intervals.
+      double stddev = 0;
+      double total = 0;
+      int batches_done = 0;
+      for (int t = 0; t < threads; t++) {
+        for (int j = 1; j < thread_counts[t]; j++) {
+          times[batches_done] = (ends[t][j] - ends[t][j - 1]) / threads;
+          total += times[batches_done].count();
+          batches_done++;
+        }
+      }
+      // Without any measured interval the statistics below are undefined.
+      if (batches_done == 0) throw Exception("Too few batches to benchmark.");
+      double mean = total / batches_done;
+      for (int j = 0; j < batches_done; j++) {
+        double diff = times[j].count() - mean;
+        stddev += diff * diff;
+      }
+      stddev = std::sqrt(stddev / (batches_done - 1));
+      double cv = stddev / mean;
+
+      std::sort(times.begin(), times.begin() + batches_done);
+
+      mean *= 1000;
+
+      const auto nps = i * batches_done / total;
+      const auto median = batches_done % 2 == 0
+                              ? 2 * i /
+                                    (times[batches_done / 2 - 1].count() +
+                                     times[batches_done / 2].count())
+                              : i / times[batches_done / 2].count();
+      if (option_dict.Get<bool>(kHeaderOnlyOnceId)
+              ? i == option_dict.Get<int>(kStartBatchSizeId)
+              : ((i - option_dict.Get<int>(kStartBatchSizeId)) /
+                     option_dict.Get<int>(kBatchStepId) % 32 ==
+                 0)) {
+        std::cout << "size,"
+                     " mean nps,"
+                     " mean ms,"
+                     "   sdev,"
+                     "     cv,"
+                     " max nps,"
+                     "  median,"
+                     " min nps,"
+                  << std::endl;
+      }
+      // clang-format off
+      std::cout << std::setw(4) << i << ","
+                << std::fixed << std::setprecision(0)
+                << std::setw(9) << nps << ","
+                << std::defaultfloat << std::setprecision(4)
+                << std::setw(8) << mean  << ","
+                << std::fixed << std::setprecision(4)
+                << std::setw(7) << stddev * 1000 << ","
+                << std::setw(7) << cv << ","
+                << std::fixed << std::setprecision(0)
+                << std::setw(8) << i / times[0].count() << ","
+                << std::setw(8) << median << ","
+                << std::setw(8) << i / times[batches_done - 1].count()
+                << std::endl;
+      // clang-format on
+
+      if (option_dict.Get<bool>(kClippyId)) {
+        float nps_ingame = std::pow((nps + best_nps) / 2, 1.085);
+        float nps_ingame2 = std::pow((nps + best_nps2) / 2, 1.085);
+        float nps_ingame3 = std::pow((nps + best_nps3) / 2, 1.085);
+        float threshold = 0.16947 * exp(-4.1695e-6 * nps_ingame * 180) + 0.02;
+        float threshold2 = 0.16947 * exp(-4.1695e-6 * nps_ingame2 * 15) + 0.02;
+        float threshold3 = 0.16947 * exp(-4.1695e-6 * nps_ingame3 * 1) + 0.02;
+
+        if (nps > best_nps &&
+            threshold * (i - best) * best_nps < (nps - best_nps) * best) {
+          best_nps = nps;
+          best = i;
+          if (threshold2 * (i - best2) * best_nps2 <
+              (nps - best_nps2) * best2) {
+            best_nps2 = nps;
+            best2 = i;
+            if (threshold3 * (i - best3) * best_nps3 <
+                (nps - best_nps3) * best3) {
+              best_nps3 = nps;
+              best3 = i;
+            }
+          }
+          if (!pending) {
+            pending = std::chrono::steady_clock::now();
+          }
+        }
+        if (pending) {
+          std::chrono::duration<double> time =
+              std::chrono::steady_clock::now() - *pending;
+          if (time.count() > 10) {
+            Clippy("Recommended minibatch-size for this net (so far):",
+                   "1s/move   (Bullet):     ", std::to_string(best3),
+                   "15s/move  (Rapid):      ", std::to_string(best2),
+                   "3min/move (Tournament): ", std::to_string(best));
+            pending.reset();
+          }
+        }
+      }
+    }
+    if (option_dict.Get<bool>(kClippyId)) {
+      Clippy("Recommended minibatch-size for this net:",
+             "1s/move   (Bullet):     ", std::to_string(best3),
+             "15s/move  (Rapid):      ", std::to_string(best2),
+             "3min/move (Tournament): ", std::to_string(best));
+    }
+  } catch (Exception& ex) {
+    std::cerr << ex.what() << std::endl;
+  }
+}
+}  // namespace lczero
diff --git a/src/benchmark/backendbench.h b/src/tools/backendbench.h
similarity index 100%
rename from src/benchmark/backendbench.h
rename to src/tools/backendbench.h
diff --git a/src/benchmark/benchmark.cc b/src/tools/benchmark.cc
similarity index 72%
rename from src/benchmark/benchmark.cc
rename to src/tools/benchmark.cc
index d24638a002..c243dfe237 100644
--- a/src/benchmark/benchmark.cc
+++ b/src/tools/benchmark.cc
@@ -25,13 +25,16 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "benchmark/benchmark.h"
+#include "tools/benchmark.h"
 
 #include <numeric>
 
-#include "mcts/search.h"
-#include "mcts/stoppers/factory.h"
-#include "mcts/stoppers/stoppers.h"
+#include "neural/memcache.h"
+#include "neural/shared_params.h"
+#include "search/classic/search.h"
+#include "search/classic/stoppers/factory.h"
+#include "search/classic/stoppers/stoppers.h"
+#include "utils/string.h"
 
 namespace lczero {
 namespace {
@@ -47,24 +50,31 @@ const OptionId kNumPositionsId{"num-positions", "",
                                "The number of benchmark positions to test."};
 }  // namespace
 
-void Benchmark::Run() {
+void Benchmark::Run(bool run_shorter_benchmark) {
   OptionsParser options;
-  NetworkFactory::PopulateOptions(&options);
+  SharedBackendParams::Populate(&options);
   options.Add<IntOption>(kThreadsOptionId, 1, 128) = kDefaultThreads;
-  options.Add<IntOption>(kNNCacheSizeId, 0, 999999999) = 200000;
-  SearchParams::Populate(&options);
+  options.GetMutableDefaultsOptions()->Set(SharedBackendParams::kNNCacheSizeId,
+                                           200000);
+  classic::SearchParams::Populate(&options);
 
   options.Add<IntOption>(kNodesId, -1, 999999999) = -1;
-  options.Add<IntOption>(kMovetimeId, -1, 999999999) = 10000;
   options.Add<StringOption>(kFenId) = "";
-  options.Add<IntOption>(kNumPositionsId, 1, 34) = 34;
+  if (run_shorter_benchmark) {
+    options.Add<IntOption>(kMovetimeId, -1, 999999999) = 500;
+    options.Add<IntOption>(kNumPositionsId, 1, 34) = 10;
+  } else {
+    options.Add<IntOption>(kMovetimeId, -1, 999999999) = 10000;
+    options.Add<IntOption>(kNumPositionsId, 1, 34) = 34;
+  }
 
   if (!options.ProcessAllFlags()) return;
 
   try {
     auto option_dict = options.GetOptionsDict();
 
-    auto network = NetworkFactory::LoadNetwork(option_dict);
+    auto backend = CreateMemCache(
+        BackendManager::Get()->CreateFromParams(option_dict), option_dict);
 
     const int visits = option_dict.Get<int>(kNodesId);
     const int movetime = option_dict.Get<int>(kMovetimeId);
@@ -86,28 +96,32 @@ void Benchmark::Run() {
       std::cout << "\nPosition: " << cnt++ << "/" << testing_positions.size()
                 << " " << position << std::endl;
 
-      auto stopper = std::make_unique<ChainedSearchStopper>();
+      auto stopper = std::make_unique<classic::ChainedSearchStopper>();
       if (movetime > -1) {
-        stopper->AddStopper(std::make_unique<TimeLimitStopper>(movetime));
+        stopper->AddStopper(
+            std::make_unique<classic::TimeLimitStopper>(movetime));
       }
       if (visits > -1) {
-        stopper->AddStopper(std::make_unique<VisitsStopper>(visits, false));
+        stopper->AddStopper(
+            std::make_unique<classic::VisitsStopper>(visits, false));
       }
 
-      NNCache cache;
-      cache.SetCapacity(option_dict.Get<int>(kNNCacheSizeId));
-
-      NodeTree tree;
-      tree.ResetToPosition(position, {});
+      classic::NodeTree tree;
+      std::vector<std::string> moves;
+      if (auto iter = position.find("moves "); iter != std::string::npos) {
+        moves = StrSplitAtWhitespace(position.substr(iter + 6));
+        position = position.substr(0, iter);
+      }
+      tree.ResetToPosition(position, moves);
 
       const auto start = std::chrono::steady_clock::now();
-      auto search = std::make_unique<Search>(
-          tree, network.get(),
+      auto search = std::make_unique<classic::Search>(
+          tree, backend.get(),
           std::make_unique<CallbackUciResponder>(
               std::bind(&Benchmark::OnBestMove, this, std::placeholders::_1),
               std::bind(&Benchmark::OnInfo, this, std::placeholders::_1)),
           MoveList(), start, std::move(stopper), false, false, option_dict,
-          &cache, nullptr);
+          nullptr);
       search->StartThreads(option_dict.Get<int>(kThreadsOptionId));
       search->Wait();
       const auto end = std::chrono::steady_clock::now();
@@ -133,14 +147,14 @@ void Benchmark::Run() {
 }
 
 void Benchmark::OnBestMove(const BestMoveInfo& move) {
-  std::cout << "bestmove " << move.bestmove.as_string() << std::endl;
+  std::cout << "bestmove " << move.bestmove.ToString(true) << std::endl;
 }
 
 void Benchmark::OnInfo(const std::vector<ThinkingInfo>& infos) {
   std::string line = "Benchmark time " + std::to_string(infos[0].time);
   line += " ms, " + std::to_string(infos[0].nodes) + " nodes, ";
   line += std::to_string(infos[0].nps) + " nps";
-  if (!infos[0].pv.empty()) line += ", move " + infos[0].pv[0].as_string();
+  if (!infos[0].pv.empty()) line += ", move " + infos[0].pv[0].ToString(true);
   std::cout << line << std::endl;
 }
 
diff --git a/src/benchmark/benchmark.h b/src/tools/benchmark.h
similarity index 97%
rename from src/benchmark/benchmark.h
rename to src/tools/benchmark.h
index a081a76883..916600fb3d 100644
--- a/src/benchmark/benchmark.h
+++ b/src/tools/benchmark.h
@@ -27,9 +27,8 @@
 
 #pragma once
 
-#include "mcts/search.h"
-#include "neural/cache.h"
-#include "neural/factory.h"
+#include "search/classic/search.h"
+#include "neural/register.h"
 #include "utils/optionsparser.h"
 
 namespace lczero {
@@ -78,7 +77,7 @@ class Benchmark{
       "3Qb1k1/1r2ppb1/pN1n2q1/Pp1Pp1Pr/4P2p/4BP2/4B1R1/1R5K b - - 11 40"
   };
 
-  void Run();
+  void Run(bool run_shorter_benchmark = false);
   void OnBestMove(const BestMoveInfo& move);
   void OnInfo(const std::vector<ThinkingInfo>& infos);
 };
diff --git a/src/lc0ctl/describenet.cc b/src/tools/describenet.cc
similarity index 97%
rename from src/lc0ctl/describenet.cc
rename to src/tools/describenet.cc
index 634589f66b..679c2e51cb 100644
--- a/src/lc0ctl/describenet.cc
+++ b/src/tools/describenet.cc
@@ -25,10 +25,10 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "lc0ctl/describenet.h"
+#include "tools/describenet.h"
 
 #include "neural/loader.h"
-#include "neural/onnx/onnx.pb.h"
+#include "proto/onnx.pb.h"
 #include "utils/optionsparser.h"
 
 namespace lczero {
@@ -110,6 +110,11 @@ void ShowNetworkFormatInfo(const pblczero::Net& weights) {
     COUT << Justify("FFN activation")
          << NetworkFormat::ActivationFunction_Name(net_format.ffn_activation());
   }
+  if (net_format.has_input_embedding()) {
+    COUT << Justify("Input embedding")
+         << NetworkFormat::InputEmbeddingFormat_Name(
+                net_format.input_embedding());
+  }
 }
 
 void ShowNetworkTrainingInfo(const pblczero::Net& weights) {
diff --git a/src/lc0ctl/describenet.h b/src/tools/describenet.h
similarity index 100%
rename from src/lc0ctl/describenet.h
rename to src/tools/describenet.h
diff --git a/src/lc0ctl/leela2onnx.cc b/src/tools/leela2onnx.cc
similarity index 88%
rename from src/lc0ctl/leela2onnx.cc
rename to src/tools/leela2onnx.cc
index 8281c6967e..fc7cb9db40 100644
--- a/src/lc0ctl/leela2onnx.cc
+++ b/src/tools/leela2onnx.cc
@@ -28,11 +28,11 @@
 #include <fstream>
 #include <iostream>
 
-#include "lc0ctl/describenet.h"
 #include "neural/loader.h"
 #include "neural/onnx/converter.h"
 #include "neural/xla/onnx2hlo.h"
 #include "neural/xla/print_hlo.h"
+#include "tools/describenet.h"
 #include "utils/files.h"
 #include "utils/optionsparser.h"
 
@@ -46,18 +46,28 @@ const OptionId kHloTextOutputFilenameId = {"hlo-text-output", "",
                                            "Path of the output HLO file."};
 const OptionId kHloProtoOutputFilenameId = {
     "hlo-proto-output", "", "Path of the output HLO proto file."};
-const OptionId kOnnxBatchSizeId{"onnx-batch-size", "",
-                                "Batch size to use for ONNX conversion."};
+const OptionId kOnnxBatchSizeId{
+    {.long_flag = "onnx-batch-size",
+     .uci_option = "",
+     .help_text = "Batch size to use for ONNX conversion.",
+     .visibility = OptionId::kProOnly}};
 const OptionId kHloBatchSizeId{"hlo-batch-size", "",
                                "Batch size to use for HLO conversion."};
 const OptionId kOnnxDataTypeId{"onnx-data-type", "",
                                "Data type to use in the ONNX model."};
 const OptionId kOnnxOpsetId{"onnx-opset", "",
                             "Opset to use in the ONNX model."};
-const OptionId kHloAllowPartialResultId = {
-    "hlo-allow-partial-result", "",
-    "Allow partial result in case of HLO conversion failure (DEBUG ONLY!)."};
-
+const OptionId kOnnxIrId{
+    {.long_flag = "onnx-ir",
+     .uci_option = "",
+     .help_text = "IR to use for the ONNX model.",
+     .visibility = OptionId::kProOnly}};
+const OptionId kHloAllowPartialResultId{
+    {.long_flag = "hlo-allow-partial-result",
+     .uci_option = "",
+     .help_text = "Allow partial result in case of HLO conversion failure "
+                  "(DEBUG ONLY!).",
+     .visibility = OptionId::kProOnly}};
 const OptionId kInputPlanesName{"input-planes-name", "",
                                 "ONNX name to use for the input planes node."};
 const OptionId kOutputPolicyHead{
@@ -77,11 +87,10 @@ const OptionId kValueHead{
     "value-head", "",
     "Value head to be used in the generated model. Typical values are "
     "'winner', 'q' or 'st', but only 'winner' is always available."};
-const OptionId kPolicyHead{
-    "policy-head", "",
-    "Policy head to be used in the generated model. Typical values are "
-    "'vanilla', 'optimistic' or 'soft', but only 'vanilla' is always "
-    "available."};
+const OptionId kPolicyHead{"policy-head", "",
+                           "Policy head to be used in the generated model. "
+                           "Typical values are 'vanilla', 'optimistic' or "
+                           "'soft', but only 'vanilla' is always available."};
 
 bool ProcessParameters(OptionsParser* options) {
   options->Add<StringOption>(kInputFilenameId);
@@ -90,12 +99,11 @@ bool ProcessParameters(OptionsParser* options) {
   options->Add<StringOption>(kHloProtoOutputFilenameId);
   options->Add<IntOption>(kOnnxBatchSizeId, -1, 2048) = -1;
   options->Add<IntOption>(kOnnxOpsetId, 7, 18) = 17;
+  options->Add<IntOption>(kOnnxIrId, -1, 10) = -1;
   options->Add<IntOption>(kHloBatchSizeId, 1, 2048) = 333;
   options->Add<ChoiceOption>(
       kOnnxDataTypeId, std::vector<std::string>{"f32", "f16", "bf16"}) = "f32";
   options->Add<BoolOption>(kHloAllowPartialResultId);
-  options->HideOption(kOnnxBatchSizeId);
-  options->HideOption(kHloAllowPartialResultId);
 
   options->Add<StringOption>(kInputPlanesName) = "/input/planes";
   options->Add<StringOption>(kOutputPolicyHead) = "/output/policy";
@@ -141,6 +149,7 @@ void ConvertLeelaToOnnx() {
     onnx_options.output_wdl = dict.Get<std::string>(kOutputWdl);
     onnx_options.output_value = dict.Get<std::string>(kOutputValue);
     onnx_options.opset = dict.Get<int>(kOnnxOpsetId);
+    onnx_options.ir = dict.Get<int>(kOnnxIrId);
     onnx_options.batch_size = dict.Get<int>(kOnnxBatchSizeId);
     onnx_options.data_type = WeightsToOnnxConverterOptions::StringToDataType(
         dict.Get<std::string>(kOnnxDataTypeId));
diff --git a/src/lc0ctl/leela2onnx.h b/src/tools/leela2onnx.h
similarity index 100%
rename from src/lc0ctl/leela2onnx.h
rename to src/tools/leela2onnx.h
diff --git a/src/lc0ctl/onnx2leela.cc b/src/tools/onnx2leela.cc
similarity index 98%
rename from src/lc0ctl/onnx2leela.cc
rename to src/tools/onnx2leela.cc
index d036138251..ff90a6e518 100644
--- a/src/lc0ctl/onnx2leela.cc
+++ b/src/tools/onnx2leela.cc
@@ -31,9 +31,9 @@
 #include <fstream>
 #include <set>
 
-#include "lc0ctl/describenet.h"
-#include "neural/onnx/onnx.pb.h"
 #include "proto/net.pb.h"
+#include "proto/onnx.pb.h"
+#include "tools/describenet.h"
 #include "utils/files.h"
 #include "utils/fp16_utils.h"
 #include "utils/optionsparser.h"
@@ -61,9 +61,9 @@ T GetEnumValueFromString(const std::string& str_value,
 }
 
 const OptionId kInputFilenameId{"input", "InputFile",
-                                "Path of the input Lc0 weights file."};
+                                "Path of the input ONNX file."};
 const OptionId kOutputFilenameId{"output", "OutputFile",
-                                 "Path of the output ONNX file."};
+                                 "Path of the output Lc0 weights file."};
 
 const OptionId kInputFormatId(
     "input-format", "InputFormat",
@@ -167,7 +167,7 @@ bool ValidateNetwork(const pblczero::Net& weights, pblczero::ModelProto& onnx) {
 
   auto check_exists = [](std::string_view n, std::set<std::string>* nodes) {
     std::string name(n);
-    if (nodes->count(name) == 0) {
+    if (!nodes->contains(name)) {
       CERR << "Node '" << name << "' doesn't exist in ONNX.";
       return false;
     }
@@ -465,9 +465,11 @@ void ConvertOnnxToLeela() {
       NetworkFormat::ValueFormat_AllValues, NetworkFormat::ValueFormat_Name));
   if (dict.OwnExists<std::string>(kOnnxOutputValueId)) {
     onnx->set_output_value(dict.Get<std::string>(kOnnxOutputValueId));
+    format->set_output(NetworkFormat::OUTPUT_CLASSICAL);
   }
   if (dict.OwnExists<std::string>(kOnnxOutputWdlId)) {
     onnx->set_output_wdl(dict.Get<std::string>(kOnnxOutputWdlId));
+    format->set_output(NetworkFormat::OUTPUT_WDL);
   }
 
   // Mlh.
diff --git a/src/lc0ctl/onnx2leela.h b/src/tools/onnx2leela.h
similarity index 100%
rename from src/lc0ctl/onnx2leela.h
rename to src/tools/onnx2leela.h
diff --git a/src/trainingdata/rescorer.cc b/src/trainingdata/rescorer.cc
new file mode 100644
index 0000000000..a2a4148d28
--- /dev/null
+++ b/src/trainingdata/rescorer.cc
@@ -0,0 +1,1470 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "trainingdata/rescorer.h"
+
+#include <algorithm>
+#include <optional>
+#include <source_location>
+#include <span>
+#include <sstream>
+
+#include "gtb-probe.h"
+#include "neural/decoder.h"
+#include "syzygy/syzygy.h"
+#include "trainingdata/reader.h"
+#include "utils/filesystem.h"
+#include "utils/optionsparser.h"
+
+namespace lczero {
+
+namespace {
+const OptionId kSyzygyTablebaseId{"syzygy-paths", "",
+                                  "List of Syzygy tablebase directories"};
+const OptionId kGaviotaTablebaseId{"gaviotatb-paths", "",
+                                   "List of Gaviota tablebase directories"};
+const OptionId kInputDirId{
+    "input", "", "Directory with gzipped files in need of rescoring."};
+const OptionId kPolicySubsDirId{"policy-substitutions", "",
+                                "Directory with gzipped files to use to "
+                                "replace policy for some of the data."};
+const OptionId kOutputDirId{"output", "", "Directory to write rescored files."};
+const OptionId kThreadsId{"threads", "",
+                          "Number of concurrent threads to rescore with.", 't'};
+const OptionId kTempId{"temperature", "",
+                       "Additional temperature to apply to policy target."};
+const OptionId kDistributionOffsetId{
+    "dist_offset", "",
+    "Additional offset to apply to policy target before temperature."};
+const OptionId kMinDTZBoostId{
+    "dtz_policy_boost", "",
+    "Additional offset to apply to policy target before temperature for moves "
+    "that are best dtz option."};
+const OptionId kNewInputFormatId{
+    "new-input-format", "",
+    "Input format to convert training data to during rescoring."};
+const OptionId kDeblunder{
+    "deblunder", "",
+    "If true, use move Q information to infer a different Z value if the "
+    "selected move appears to be a blunder."};
+const OptionId kDeblunderQBlunderThreshold{
+    "deblunder-q-blunder-threshold", "",
+    "The amount Q of played move needs to be worse than best move in order to "
+    "assume the played move is a blunder."};
+const OptionId kDeblunderQBlunderWidth{
+    "deblunder-q-blunder-width", "",
+    "Width of the transition between accepted temp moves and blunders."};
+const OptionId kNnuePlainFileId{"nnue-plain-file", "",
+                                "Append SF plain format training data to this "
+                                "file. Will be generated if not there."};
+const OptionId kNnueBestScoreId{"nnue-best-score", "",
+                                "For the SF training data use the score of the "
+                                "best move instead of the played one."};
+const OptionId kNnueBestMoveId{
+    "nnue-best-move", "",
+    "For the SF training data record the best move instead of the played one. "
+    "If set to true the generated files do not compress well."};
+const OptionId kDeleteFilesId{"delete-files", "",
+                              "Delete the input files after processing."};
+
+class PolicySubNode {
+ public:
+  PolicySubNode() {
+    for (int i = 0; i < 1858; i++) children[i] = nullptr;
+  }
+  bool active = false;
+  float policy[1858];
+  PolicySubNode* children[1858];
+};
+
+std::atomic<int> games(0);
+std::atomic<int> positions(0);
+std::atomic<int> rescored(0);
+std::atomic<int> delta(0);
+std::atomic<int> rescored2(0);
+std::atomic<int> rescored3(0);
+std::atomic<int> blunders(0);
+std::atomic<int> orig_counts[3];
+std::atomic<int> fixed_counts[3];
+std::atomic<int> policy_bump(0);
+std::atomic<int> policy_nobump_total_hist[11];
+std::atomic<int> policy_bump_total_hist[11];
+std::atomic<int> policy_dtm_bump(0);
+std::atomic<int> gaviota_dtm_rescores(0);
+std::map<uint64_t, PolicySubNode> policy_subs;
+bool gaviotaEnabled = false;
+bool deblunderEnabled = false;
+float deblunderQBlunderThreshold = 2.0f;
+float deblunderQBlunderWidth = 0.0f;
+
+void DataAssert(bool check_result,
+                std::source_location loc = std::source_location::current()) {
+  if (!check_result) {
+    throw Exception(std::string("Range Violation at ") + loc.file_name() + ":" +
+                    std::to_string(loc.line()));
+  }
+}
+
+void Validate(std::span<const V6TrainingData> fileContents) {
+  if (fileContents.empty()) throw Exception("Empty File");
+
+  for (size_t i = 0; i < fileContents.size(); i++) {
+    auto& data = fileContents[i];
+    DataAssert(
+        data.input_format ==
+            pblczero::NetworkFormat::INPUT_CLASSICAL_112_PLANE ||
+        data.input_format ==
+            pblczero::NetworkFormat::INPUT_112_WITH_CASTLING_PLANE ||
+        data.input_format ==
+            pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION ||
+        data.input_format == pblczero::NetworkFormat::
+                                 INPUT_112_WITH_CANONICALIZATION_HECTOPLIES ||
+        data.input_format ==
+            pblczero::NetworkFormat::
+                INPUT_112_WITH_CANONICALIZATION_HECTOPLIES_ARMAGEDDON ||
+        data.input_format ==
+            pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION_V2 ||
+        data.input_format == pblczero::NetworkFormat::
+                                 INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON);
+    DataAssert(data.best_d >= 0.0f && data.best_d <= 1.0f);
+    DataAssert(data.root_d >= 0.0f && data.root_d <= 1.0f);
+    DataAssert(data.best_q >= -1.0f && data.best_q <= 1.0f);
+    DataAssert(data.root_q >= -1.0f && data.root_q <= 1.0f);
+    DataAssert(data.root_m >= 0.0f);
+    DataAssert(data.best_m >= 0.0f);
+    DataAssert(data.plies_left >= 0.0f);
+    switch (data.input_format) {
+      case pblczero::NetworkFormat::INPUT_CLASSICAL_112_PLANE:
+        DataAssert(data.castling_them_oo <= 1);
+        DataAssert(data.castling_them_ooo <= 1);
+        DataAssert(data.castling_us_oo <= 1);
+        DataAssert(data.castling_us_ooo <= 1);
+        break;
+      default:
+        // Verify at most one bit set.
+        DataAssert((data.castling_them_oo & (data.castling_them_oo - 1)) == 0);
+        DataAssert((data.castling_them_ooo & (data.castling_them_ooo - 1)) ==
+                   0);
+        DataAssert((data.castling_us_oo & (data.castling_us_oo - 1)) == 0);
+        DataAssert((data.castling_us_ooo & (data.castling_us_ooo - 1)) == 0);
+    }
+    if (IsCanonicalFormat(static_cast<pblczero::NetworkFormat::InputFormat>(
+            data.input_format))) {
+      // At most one en-passant bit.
+      DataAssert((data.side_to_move_or_enpassant &
+                  (data.side_to_move_or_enpassant - 1)) == 0);
+    } else {
+      DataAssert(data.side_to_move_or_enpassant <= 1);
+    }
+    DataAssert(data.result_q >= -1 && data.result_q <= 1);
+    DataAssert(data.result_d >= 0 && data.result_d <= 1);
+    DataAssert(data.rule50_count <= 100);
+    float sum = 0.0f;
+    for (size_t j = 0; j < sizeof(data.probabilities) / sizeof(float); j++) {
+      float prob = data.probabilities[j];
+      DataAssert((prob >= 0.0f && prob <= 1.0f) || prob == -1.0f ||
+                 std::isnan(prob));
+      if (prob >= 0.0f) {
+        sum += prob;
+      }
+      // Only check best_idx/played_idx for real v6 data.
+      if (data.visits > 0) {
+        // Best_idx and played_idx must be marked legal in probabilities.
+        if (j == data.best_idx || j == data.played_idx) {
+          DataAssert(prob >= 0.0f);
+        }
+      }
+    }
+    if (sum < 0.99f || sum > 1.01f) {
+      throw Exception("Probability sum error is huge!");
+    }
+    DataAssert(data.best_idx <= 1858);
+    DataAssert(data.played_idx <= 1858);
+    DataAssert(data.played_q >= -1.0f && data.played_q <= 1.0f);
+    DataAssert(data.played_d >= 0.0f && data.played_d <= 1.0f);
+    DataAssert(data.played_m >= 0.0f);
+    DataAssert(std::isnan(data.orig_q) ||
+               (data.orig_q >= -1.0f && data.orig_q <= 1.0f));
+    DataAssert(std::isnan(data.orig_d) ||
+               (data.orig_d >= 0.0f && data.orig_d <= 1.0f));
+    DataAssert(std::isnan(data.orig_m) || data.orig_m >= 0.0f);
+    // TODO: if visits > 0 - assert best_idx/played_idx are valid in
+    // probabilities.
+  }
+}
+
+void Validate(std::span<const V6TrainingData> fileContents,
+              const MoveList& moves) {
+  PositionHistory history;
+  int rule50ply;
+  int gameply;
+  ChessBoard board;
+  auto input_format = static_cast<pblczero::NetworkFormat::InputFormat>(
+      fileContents[0].input_format);
+  PopulateBoard(input_format, PlanesFromTrainingData(fileContents[0]), &board,
+                &rule50ply, &gameply);
+  history.Reset(board, rule50ply, gameply);
+  for (size_t i = 0; i < moves.size(); i++) {
+    int transform = TransformForPosition(input_format, history);
+    // If real v6 data, can confirm that played_idx matches the inferred move.
+    if (fileContents[i].visits > 0) {
+      if (fileContents[i].played_idx != MoveToNNIndex(moves[i], transform)) {
+        throw Exception("Move performed is not listed as played.");
+      }
+    }
+    // Move shouldn't be marked illegal unless there is 0 visits, which should
+    // only happen if invariance_info is marked with the placeholder bit.
+    if (!(fileContents[i].probabilities[MoveToNNIndex(moves[i], transform)] >=
+          0.0f) &&
+        (fileContents[i].invariance_info & 64) == 0) {
+      std::cerr << "Illegal move: " << moves[i].ToString(true) << std::endl;
+      throw Exception("Move performed is marked illegal in probabilities.");
+    }
+    auto legal = history.Last().GetBoard().GenerateLegalMoves();
+    if (std::find(legal.begin(), legal.end(), moves[i]) == legal.end()) {
+      std::cerr << "Illegal move: " << moves[i].ToString(true) << std::endl;
+      throw Exception("Move performed is an illegal move.");
+    }
+    history.Append(moves[i]);
+  }
+}
+
+void gaviota_tb_probe_hard(const Position& pos, unsigned int& info,
+                           unsigned int& dtm) {
+  unsigned int wsq[17];
+  unsigned int bsq[17];
+  unsigned char wpc[17];
+  unsigned char bpc[17];
+
+  auto stm = pos.IsBlackToMove() ? tb_BLACK_TO_MOVE : tb_WHITE_TO_MOVE;
+  ChessBoard board = pos.GetBoard();
+  if (pos.IsBlackToMove()) board.Mirror();
+  auto epsq = tb_NOSQUARE;
+  for (auto sq : board.en_passant()) {
+    // Our internal representation stores en_passant 2 rows away
+    // from the actual sq.
+    if (sq.rank().idx == 0) {
+      epsq = (TB_squares)(sq.as_idx() + 16);
+    } else {
+      epsq = (TB_squares)(sq.as_idx() - 16);
+    }
+  }
+  int idx = 0;
+  for (auto sq : (board.ours() & board.kings())) {
+    wsq[idx] = (TB_squares)sq.as_idx();
+    wpc[idx] = tb_KING;
+    idx++;
+  }
+  for (auto sq : (board.ours() & board.knights())) {
+    wsq[idx] = (TB_squares)sq.as_idx();
+    wpc[idx] = tb_KNIGHT;
+    idx++;
+  }
+  for (auto sq : (board.ours() & board.queens())) {
+    wsq[idx] = (TB_squares)sq.as_idx();
+    wpc[idx] = tb_QUEEN;
+    idx++;
+  }
+  for (auto sq : (board.ours() & board.rooks())) {
+    wsq[idx] = (TB_squares)sq.as_idx();
+    wpc[idx] = tb_ROOK;
+    idx++;
+  }
+  for (auto sq : (board.ours() & board.bishops())) {
+    wsq[idx] = (TB_squares)sq.as_idx();
+    wpc[idx] = tb_BISHOP;
+    idx++;
+  }
+  for (auto sq : (board.ours() & board.pawns())) {
+    wsq[idx] = (TB_squares)sq.as_idx();
+    wpc[idx] = tb_PAWN;
+    idx++;
+  }
+  wsq[idx] = tb_NOSQUARE;
+  wpc[idx] = tb_NOPIECE;
+
+  idx = 0;
+  for (auto sq : (board.theirs() & board.kings())) {
+    bsq[idx] = (TB_squares)sq.as_idx();
+    bpc[idx] = tb_KING;
+    idx++;
+  }
+  for (auto sq : (board.theirs() & board.knights())) {
+    bsq[idx] = (TB_squares)sq.as_idx();
+    bpc[idx] = tb_KNIGHT;
+    idx++;
+  }
+  for (auto sq : (board.theirs() & board.queens())) {
+    bsq[idx] = (TB_squares)sq.as_idx();
+    bpc[idx] = tb_QUEEN;
+    idx++;
+  }
+  for (auto sq : (board.theirs() & board.rooks())) {
+    bsq[idx] = (TB_squares)sq.as_idx();
+    bpc[idx] = tb_ROOK;
+    idx++;
+  }
+  for (auto sq : (board.theirs() & board.bishops())) {
+    bsq[idx] = (TB_squares)sq.as_idx();
+    bpc[idx] = tb_BISHOP;
+    idx++;
+  }
+  for (auto sq : (board.theirs() & board.pawns())) {
+    bsq[idx] = (TB_squares)sq.as_idx();
+    bpc[idx] = tb_PAWN;
+    idx++;
+  }
+  bsq[idx] = tb_NOSQUARE;
+  bpc[idx] = tb_NOPIECE;
+
+  tb_probe_hard(stm, epsq, tb_NOCASTLE, wsq, bsq, wpc, bpc, &info, &dtm);
+}
+
+void ChangeInputFormat(int newInputFormat, V6TrainingData* data,
+                       const PositionHistory& history) {
+  data->input_format = newInputFormat;
+  auto input_format =
+      static_cast<pblczero::NetworkFormat::InputFormat>(newInputFormat);
+
+  // Populate planes.
+  int transform;
+  InputPlanes planes = EncodePositionForNN(input_format, history, 8,
+                                           FillEmptyHistory::NO, &transform);
+  int plane_idx = 0;
+  for (auto& plane : data->planes) {
+    plane = ReverseBitsInBytes(planes[plane_idx++].mask);
+  }
+
+  if ((data->invariance_info & 7) != transform) {
+    // Probabilities need reshuffling.
+    float newProbs[1858];
+    std::fill(std::begin(newProbs), std::end(newProbs), -1);
+    bool played_fixed = false;
+    bool best_fixed = false;
+    for (auto move : history.Last().GetBoard().GenerateLegalMoves()) {
+      int i = MoveToNNIndex(move, transform);
+      int j = MoveToNNIndex(move, data->invariance_info & 7);
+      newProbs[i] = data->probabilities[j];
+      // For V6 data only, the played/best idx need updating.
+      if (data->visits > 0) {
+        if (data->played_idx == j && !played_fixed) {
+          data->played_idx = i;
+          played_fixed = true;
+        }
+        if (data->best_idx == j && !best_fixed) {
+          data->best_idx = i;
+          best_fixed = true;
+        }
+      }
+    }
+    for (int i = 0; i < 1858; i++) {
+      data->probabilities[i] = newProbs[i];
+    }
+  }
+
+  const auto& position = history.Last();
+  const auto& castlings = position.GetBoard().castlings();
+  // Populate castlings.
+  // For non-frc trained nets, just send 1 like we used to.
+  uint8_t our_queen_side = 1;
+  uint8_t our_king_side = 1;
+  uint8_t their_queen_side = 1;
+  uint8_t their_king_side = 1;
+  // If frc trained, send the bit mask representing rook position.
+  if (Is960CastlingFormat(input_format)) {
+    our_queen_side <<= castlings.our_queenside_rook.idx;
+    our_king_side <<= castlings.our_kingside_rook.idx;
+    their_queen_side <<= castlings.their_queenside_rook.idx;
+    their_king_side <<= castlings.their_kingside_rook.idx;
+  }
+
+  data->castling_us_ooo = castlings.we_can_000() ? our_queen_side : 0;
+  data->castling_us_oo = castlings.we_can_00() ? our_king_side : 0;
+  data->castling_them_ooo = castlings.they_can_000() ? their_queen_side : 0;
+  data->castling_them_oo = castlings.they_can_00() ? their_king_side : 0;
+
+  // Save the bits that aren't connected to the input_format.
+  uint8_t invariance_mask = data->invariance_info & 0x78;
+  // Other params.
+  if (IsCanonicalFormat(input_format)) {
+    data->side_to_move_or_enpassant =
+        position.GetBoard().en_passant().as_int() >> 56;
+    if ((transform & FlipTransform) != 0) {
+      data->side_to_move_or_enpassant =
+          ReverseBitsInBytes(data->side_to_move_or_enpassant);
+    }
+    // Send transform in deprecated move count so rescorer can reverse it to
+    // calculate the actual move list from the input data.
+    data->invariance_info =
+        transform | (position.IsBlackToMove() ? (1u << 7) : 0u);
+  } else {
+    data->side_to_move_or_enpassant = position.IsBlackToMove() ? 1 : 0;
+    data->invariance_info = 0;
+  }
+  // Put the mask back.
+  data->invariance_info |= invariance_mask;
+}
+
+int ResultForData(const V6TrainingData& data) {
+  // Ensure we aren't reprocessing some data that has had custom adjustments to
+  // result training target applied.
+  DataAssert(data.result_q == -1.0f || data.result_q == 1.0f ||
+             data.result_q == 0.0f);
+  // Paranoia - ensure int cast never breaks the value.
+  DataAssert(data.result_q ==
+             static_cast<float>(static_cast<int>(data.result_q)));
+  return static_cast<int>(data.result_q);
+}
+
+std::string AsNnueString(const Position& p, Move m, float q, int result) {
+  std::ostringstream out;
+  out << "fen " << PositionToFen(p) << std::endl;
+  if (p.IsBlackToMove()) m.Flip();
+  out << "move " << m.ToString(false) << std::endl;
+  // Formula from PR1477 adjusted for SF PawnValueEg.
+  out << "score " << round(660.6 * q / (1 - 0.9751875 * std::pow(q, 10)))
+      << std::endl;
+  out << "ply " << p.GetGamePly() << std::endl;
+  out << "result " << result << std::endl;
+  out << "e" << std::endl;
+  return out.str();
+}
+
+struct ProcessFileFlags {
+  bool delete_files : 1;
+  bool nnue_best_score : 1;
+  bool nnue_best_move : 1;
+};
+
+struct FileData {
+  std::vector<V6TrainingData> fileContents;
+  MoveList moves;
+  pblczero::NetworkFormat::InputFormat input_format;
+};
+
+bool IsAllDraws(const FileData& data) {
+  for (const auto& chunk : data.fileContents) {
+    if (ResultForData(chunk) != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+std::vector<V6TrainingData> ReadFile(const std::string& file) {
+  std::vector<V6TrainingData> fileContents;
+
+  TrainingDataReader reader(file);
+  V6TrainingData chunk;
+  while (reader.ReadChunk(&chunk)) {
+    fileContents.push_back(chunk);
+  }
+
+  return fileContents;
+}
+
+FileData ProcessAndValidateFileData(std::vector<V6TrainingData> fileContents) {
+  FileData data;
+  data.fileContents = std::move(fileContents);
+
+  Validate(data.fileContents);
+  games += 1;
+  positions += data.fileContents.size();
+  // Decode moves from input data
+  for (size_t i = 1; i < data.fileContents.size(); i++) {
+    data.moves.push_back(
+        DecodeMoveFromInput(PlanesFromTrainingData(data.fileContents[i]),
+                            PlanesFromTrainingData(data.fileContents[i - 1])));
+    // All moves decoded are from the point of view of the side after the
+    // move so need to mirror them all to be applicable to apply to the
+    // position before.
+    data.moves.back().Flip();
+  }
+  Validate(data.fileContents, data.moves);
+
+  data.input_format = static_cast<pblczero::NetworkFormat::InputFormat>(
+      data.fileContents[0].input_format);
+
+  return data;
+}
+
+void ApplyPolicySubstitutions(FileData& data) {
+  if (policy_subs.empty()) return;
+  PositionHistory history;
+  int rule50ply;
+  int gameply;
+  ChessBoard board;
+
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &board, &rule50ply, &gameply);
+  history.Reset(board, rule50ply, gameply);
+  uint64_t rootHash = HashCat(board.Hash(), rule50ply);
+
+  if (auto it = policy_subs.find(rootHash); it != policy_subs.end()) {
+    PolicySubNode* rootNode = &it->second;
+    for (size_t i = 0; i < data.fileContents.size(); i++) {
+      if (rootNode->active) {
+        for (int j = 0; j < 1858; j++) {
+          data.fileContents[i].probabilities[j] = rootNode->policy[j];
+        }
+      }
+      if (i + 1 < data.fileContents.size()) {
+        int transform = TransformForPosition(data.input_format, history);
+        int idx = MoveToNNIndex(data.moves[i], transform);
+        if (rootNode->children[idx] == nullptr) {
+          break;
+        }
+        rootNode = rootNode->children[idx];
+        history.Append(data.moves[i]);
+      }
+    }
+  }
+}
+
+void ApplySyzygyRescoring(FileData& data, SyzygyTablebase* tablebase) {
+  PositionHistory history;
+  int rule50ply;
+  int gameply;
+  ChessBoard board;
+
+  // First pass: rescoring positions with rule50ply == 0
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &board, &rule50ply, &gameply);
+  history.Reset(board, rule50ply, gameply);
+  int last_rescore = -1;
+  orig_counts[ResultForData(data.fileContents[0]) + 1]++;
+  fixed_counts[ResultForData(data.fileContents[0]) + 1]++;
+
+  for (int i = 0; i < static_cast<int>(data.moves.size()); i++) {
+    history.Append(data.moves[i]);
+    const auto& board = history.Last().GetBoard();
+    if (board.castlings().no_legal_castle() &&
+        history.Last().GetRule50Ply() == 0 &&
+        (board.ours() | board.theirs()).count() <=
+            tablebase->max_cardinality()) {
+      ProbeState state;
+      WDLScore wdl = tablebase->probe_wdl(history.Last(), &state);
+      // Only fail state means the WDL is wrong, probe_wdl may produce
+      // correct result with a state other than OK.
+      if (state != FAIL) {
+        int8_t score_to_apply = 0;
+        if (wdl == WDL_WIN) {
+          score_to_apply = 1;
+        } else if (wdl == WDL_LOSS) {
+          score_to_apply = -1;
+        }
+        for (int j = i + 1; j > last_rescore; j--) {
+          if (ResultForData(data.fileContents[j]) != score_to_apply) {
+            if (j == i + 1 && last_rescore == -1) {
+              fixed_counts[ResultForData(data.fileContents[0]) + 1]--;
+              bool flip = (i % 2) == 0;
+              fixed_counts[(flip ? -score_to_apply : score_to_apply) + 1]++;
+            }
+            rescored += 1;
+            delta += abs(ResultForData(data.fileContents[j]) - score_to_apply);
+          }
+
+          if (score_to_apply == 0) {
+            data.fileContents[j].result_d = 1.0f;
+          } else {
+            data.fileContents[j].result_d = 0.0f;
+          }
+          data.fileContents[j].result_q = static_cast<float>(score_to_apply);
+          score_to_apply = -score_to_apply;
+        }
+        last_rescore = i + 1;
+      }
+    }
+  }
+
+  // Second pass: rescoring positions with rule50ply != 0
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &board, &rule50ply, &gameply);
+  history.Reset(board, rule50ply, gameply);
+
+  for (size_t i = 0; i < data.moves.size(); i++) {
+    history.Append(data.moves[i]);
+    const auto& board = history.Last().GetBoard();
+    if (board.castlings().no_legal_castle() &&
+        history.Last().GetRule50Ply() != 0 &&
+        (board.ours() | board.theirs()).count() <=
+            tablebase->max_cardinality()) {
+      ProbeState state;
+      WDLScore wdl = tablebase->probe_wdl(history.Last(), &state);
+      // Only fail state means the WDL is wrong, probe_wdl may produce
+      // correct result with a stat other than OK.
+      if (state != FAIL) {
+        int8_t score_to_apply = 0;
+        if (wdl == WDL_WIN) {
+          score_to_apply = 1;
+        } else if (wdl == WDL_LOSS) {
+          score_to_apply = -1;
+        }
+        // If the WDL result disagrees with the game outcome, make it a
+        // draw. WDL draw is always draw regardless of prior moves since
+        // zero, so that clearly works. Otherwise, the WDL result could be
+        // correct or draw, so best we can do is change scores that don't
+        // agree, to be a draw. If score was a draw this is a no-op, if it
+        // was opposite it becomes a draw.
+        int8_t new_score =
+            ResultForData(data.fileContents[i + 1]) != score_to_apply
+                ? 0
+                : ResultForData(data.fileContents[i + 1]);
+        bool dtz_rescored = false;
+        // if score is not already right, and the score to apply isn't 0,
+        // dtz can let us know its definitely correct.
+        if (ResultForData(data.fileContents[i + 1]) != score_to_apply &&
+            score_to_apply != 0) {
+          // Any repetitions in the history since last 50 ply makes it risky
+          // to assume dtz is still correct.
+          int steps = history.Last().GetRule50Ply();
+          bool no_reps = true;
+          for (int i = 0; i < steps; i++) {
+            // If game started from non-zero 50 move rule, this could
+            // underflow. Only safe option is to assume there were
+            // repetitions before this point.
+            if (history.GetLength() - i - 1 < 0) {
+              no_reps = false;
+              break;
+            }
+            if (history.GetPositionAt(history.GetLength() - i - 1)
+                    .GetRepetitions() != 0) {
+              no_reps = false;
+              break;
+            }
+          }
+          if (no_reps) {
+            int depth = tablebase->probe_dtz(history.Last(), &state);
+            if (state != FAIL) {
+              // This should be able to be <= 99 safely, but I've not
+              // convinced myself thats true.
+              if (steps + std::abs(depth) < 99) {
+                rescored3++;
+                new_score = score_to_apply;
+                dtz_rescored = true;
+              }
+            }
+          }
+        }
+
+        // If score is not already a draw, and its not obviously a draw,
+        // check if 50 move rule has advanced so far its obviously a draw.
+        // Obviously not needed if we've already proven with dtz that its a
+        // win/loss.
+        if (ResultForData(data.fileContents[i + 1]) != 0 &&
+            score_to_apply != 0 && !dtz_rescored) {
+          int depth = tablebase->probe_dtz(history.Last(), &state);
+          if (state != FAIL) {
+            int steps = history.Last().GetRule50Ply();
+            // This should be able to be >= 101 safely, but I've not
+            // convinced myself thats true.
+            if (steps + std::abs(depth) > 101) {
+              rescored3++;
+              new_score = 0;
+              dtz_rescored = true;
+            }
+          }
+        }
+        if (new_score != ResultForData(data.fileContents[i + 1])) {
+          rescored2 += 1;
+        }
+
+        if (new_score == 0) {
+          data.fileContents[i + 1].result_d = 1.0f;
+        } else {
+          data.fileContents[i + 1].result_d = 0.0f;
+        }
+        data.fileContents[i + 1].result_q = static_cast<float>(new_score);
+      }
+    }
+  }
+}
+
+// Reshapes the policy target of every chunk in `data`.
+//
+// For each position, in game order:
+//  1. If dtzBoost is non-zero and the position is in Syzygy range (no
+//     castling rights, piece count <= max_cardinality), the moves returned by
+//     the tablebase root probe are selected for boosting. With Gaviota
+//     enabled and at most 5 pieces on the board, the selection is narrowed to
+//     the safe moves with the smallest DTM.
+//  2. Every non-negative, non-NaN probability gets distOffset (plus an equal
+//     share of dtzBoost for selected moves) added, is clamped at 0 and is
+//     raised to the power 1 / distTemp.
+//  3. The probabilities are renormalised by their sum.
+//
+// Does nothing when all three parameters are at their neutral values.
+// Side effects: updates policy_bump, policy_dtm_bump and the two
+// policy_*_total_hist histograms.
+void ApplyPolicyAdjustments(FileData& data, SyzygyTablebase* tablebase,
+                            float distTemp, float distOffset, float dtzBoost) {
+  if (distTemp == 1.0f && distOffset == 0.0f && dtzBoost == 0.0f) {
+    return;  // No adjustments needed
+  }
+
+  PositionHistory history;
+  int rule50ply;
+  int gameply;
+  ChessBoard board;
+
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &board, &rule50ply, &gameply);
+  history.Reset(board, rule50ply, gameply);
+  size_t move_index = 0;
+
+  for (auto& chunk : data.fileContents) {
+    const auto& board = history.Last().GetBoard();
+    // One flag per policy index: true for moves that receive the boost.
+    std::vector<bool> boost_probs(1858, false);
+    int boost_count = 0;
+
+    if (dtzBoost != 0.0f && board.castlings().no_legal_castle() &&
+        (board.ours() | board.theirs()).count() <=
+            tablebase->max_cardinality()) {
+      MoveList to_boost;
+      MoveList maybe_boost;
+      tablebase->root_probe(history.Last(), true, true, &to_boost);
+      if (history.DidRepeatSinceLastZeroingMove()) {
+        maybe_boost = to_boost;
+      } else {
+        tablebase->root_probe(history.Last(), false, true, &maybe_boost);
+      }
+      // If there is only one move, dtm fixup is not helpful.
+      // This code assumes all gaviota 3-4-5 tbs are present, as checked
+      // at startup.
+      if (gaviotaEnabled && maybe_boost.size() > 1 &&
+          (board.ours() | board.theirs()).count() <= 5) {
+        // dtms[k] is the DTM after playing maybe_boost[k]. Reserve (not
+        // resize) so that push_back fills slots 0..n-1; resizing first left
+        // n zero entries in front and the lookup below read those instead.
+        std::vector<unsigned int> dtms;
+        dtms.reserve(maybe_boost.size());
+        unsigned int minimum_dtm = 1000;
+        // Only safe moves being considered, boost the smallest dtm
+        // amongst them.
+        for (auto& move : maybe_boost) {
+          Position next_pos = Position(history.Last(), move);
+          unsigned int info;
+          unsigned int dtm;
+          gaviota_tb_probe_hard(next_pos, info, dtm);
+          dtms.push_back(dtm);
+          if (dtm < minimum_dtm) minimum_dtm = dtm;
+        }
+        if (minimum_dtm < 1000) {
+          to_boost.clear();
+          int dtm_idx = 0;
+          for (auto& move : maybe_boost) {
+            if (dtms[dtm_idx] == minimum_dtm) {
+              to_boost.push_back(move);
+            }
+            dtm_idx++;
+          }
+          policy_dtm_bump++;
+        }
+      }
+      int transform = TransformForPosition(data.input_format, history);
+      for (auto& move : to_boost) {
+        boost_probs[MoveToNNIndex(move, transform)] = true;
+      }
+      boost_count = to_boost.size();
+    }
+    float sum = 0.0;
+    int prob_index = 0;
+    float preboost_sum = 0.0f;
+    // Pass 1: apply offset, boost and temperature; accumulate the new total.
+    for (auto& prob : chunk.probabilities) {
+      // boost_count is only used as a divisor for flagged moves, and a move
+      // can only be flagged when boost_count > 0.
+      float offset =
+          distOffset +
+          (boost_probs[prob_index] ? (dtzBoost / boost_count) : 0.0f);
+      if (dtzBoost != 0.0f && boost_probs[prob_index]) {
+        preboost_sum += prob;
+        if (prob < 0 || std::isnan(prob))
+          std::cerr << "Bump for move that is illegal????" << std::endl;
+        policy_bump++;
+      }
+      prob_index++;
+      // Negative or NaN entries are left untouched.
+      if (prob < 0 || std::isnan(prob)) continue;
+      prob = std::max(0.0f, prob + offset);
+      prob = std::pow(prob, 1.0f / distTemp);
+      sum += prob;
+    }
+    prob_index = 0;
+    float boost_sum = 0.0f;
+    // Pass 2: renormalise and measure the policy mass on boosted moves.
+    for (auto& prob : chunk.probabilities) {
+      if (dtzBoost != 0.0f && boost_probs[prob_index]) {
+        boost_sum += prob / sum;
+      }
+      prob_index++;
+      if (prob < 0 || std::isnan(prob)) continue;
+      prob /= sum;
+    }
+    if (boost_count > 0) {
+      // Histogram buckets are tenths of total policy mass.
+      policy_nobump_total_hist[(int)(preboost_sum * 10)]++;
+      policy_bump_total_hist[(int)(boost_sum * 10)]++;
+    }
+    // The last chunk has no following move.
+    if (move_index < data.moves.size()) {
+      history.Append(data.moves[move_index]);
+      move_index++;
+    }
+  }
+}
+
+// Fills in a plies_left estimate for chunks that do not carry one.
+//
+// A chunk whose plies_left is exactly 0 (not possible in real v5 data, so it
+// must come from a v4 conversion) gets the number of chunks from itself to
+// the end of the game as its starting estimate. Other chunks are unchanged.
+void EstimateAndCorrectPliesLeft(FileData& data) {
+  const size_t total = data.fileContents.size();
+  for (size_t idx = 0; idx < total; ++idx) {
+    auto& entry = data.fileContents[idx];
+    if (entry.plies_left != 0.0f) continue;
+    entry.plies_left = static_cast<int>(total - idx);
+  }
+}
+
+// Corrects plies_left using Gaviota distance-to-mate probes.
+//
+// No-op unless Gaviota tablebases are enabled and the game contains at least
+// one non-draw result. For every position with at most 5 pieces, no castling
+// rights and a decisive stored result, the DTM is written to that chunk and
+// propagated backwards (one extra ply per step) until the previously
+// corrected chunk is reached. Positions are skipped when the probe does not
+// report a mate, when following the DTM line could run into the 50-move
+// rule, or when a repetition occurred since the last zeroing move.
+//
+// Side effects: increments gaviota_dtm_rescores.
+void ApplyGaviotaCorrections(FileData& data) {
+  if (!gaviotaEnabled) return;
+
+  if (IsAllDraws(data)) return;
+
+  PositionHistory history;
+  int rule50ply;
+  int gameply;
+  ChessBoard board;
+
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &board, &rule50ply, &gameply);
+  history.Reset(board, rule50ply, gameply);
+  // Move index of the most recent correction; propagation stops there.
+  int last_rescore = 0;
+
+  for (size_t i = 0; i < data.moves.size(); i++) {
+    history.Append(data.moves[i]);
+    const auto& board = history.Last().GetBoard();
+
+    // Gaviota TBs don't have 50 move rule.
+    // Only consider positions that are not draw after rescoring.
+    if ((ResultForData(data.fileContents[i + 1]) != 0) &&
+        board.castlings().no_legal_castle() &&
+        (board.ours() | board.theirs()).count() <= 5) {
+      unsigned int info;
+      unsigned int dtm;
+      gaviota_tb_probe_hard(history.Last(), info, dtm);
+      if (info != tb_WMATE && info != tb_BMATE) {
+        // Not a win for either player.
+        continue;
+      }
+      int steps = history.Last().GetRule50Ply();
+      if ((dtm + steps > 99) && (dtm <= data.fileContents[i + 1].plies_left)) {
+        // Following DTM could trigger 50 move rule and the current
+        // move_count is more than DTM.
+        // If DTM is more than the current move_count then we can rescore
+        // using it since DTM50 is not shorter than DTM.
+        continue;
+      }
+      bool no_reps = true;
+      for (int back = 0; back < steps; back++) {
+        // If game started from non-zero 50 move rule, this could
+        // underflow. Only safe option is to assume there were repetitions
+        // before this point.
+        if (history.GetLength() - back - 1 < 0) {
+          no_reps = false;
+          break;
+        }
+        if (history.GetPositionAt(history.GetLength() - back - 1)
+                .GetRepetitions() != 0) {
+          no_reps = false;
+          break;
+        }
+      }
+      if (!no_reps) {
+        // There were repetitions. Do nothing since DTM path
+        // could trigger draw by repetition.
+        continue;
+      }
+      gaviota_dtm_rescores++;
+      // Chunk j + 1 is (i - j) plies before the probed position.
+      for (int j = i; j >= -1; j--) {
+        if (j <= last_rescore) {
+          break;
+        }
+        data.fileContents[j + 1].plies_left = int(dtm + (i - j));
+      }
+      last_rescore = i;
+    }
+  }
+}
+
+// Corrects plies_left from Syzygy DTZ, for 3 piece no-pawn positions only.
+//
+// Skipped entirely when Gaviota tablebases are enabled (they are used
+// instead) or when every result in the game is a draw.
+void ApplyDTZCorrections(FileData& data, SyzygyTablebase* tablebase) {
+  if (gaviotaEnabled) return;
+
+  if (IsAllDraws(data)) return;
+
+  PositionHistory history;
+  int start_rule50;
+  int start_gameply;
+  ChessBoard start_board;
+
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &start_board, &start_rule50, &start_gameply);
+  history.Reset(start_board, start_rule50, start_gameply);
+
+  for (size_t i = 0; i < data.moves.size(); i++) {
+    history.Append(data.moves[i]);
+    const auto& cur_board = history.Last().GetBoard();
+    if (!cur_board.castlings().no_legal_castle() ||
+        (cur_board.ours() | cur_board.theirs()).count() > 3 ||
+        !cur_board.pawns().empty()) {
+      continue;
+    }
+
+    ProbeState state;
+    const WDLScore wdl = tablebase->probe_wdl(history.Last(), &state);
+    // Only the FAIL state invalidates the WDL; probe_wdl may produce a
+    // correct result with a state other than OK.
+    if (state == FAIL) continue;
+    // No point updating for draws: anything but a plain win or loss is
+    // treated as one.
+    if (wdl != WDL_WIN && wdl != WDL_LOSS) continue;
+
+    // Any repetition since the last zeroing move makes it risky to assume
+    // that dtz is still correct.
+    const int steps = history.Last().GetRule50Ply();
+    bool repetition_free = true;
+    for (int back = 0; back < steps; back++) {
+      // If the game started from a non-zero 50 move counter this could run
+      // off the front of the history; the only safe option is to assume
+      // there were repetitions before this point.
+      if (history.GetLength() - back - 1 < 0) {
+        repetition_free = false;
+        break;
+      }
+      if (history.GetPositionAt(history.GetLength() - back - 1)
+              .GetRepetitions() != 0) {
+        repetition_free = false;
+        break;
+      }
+    }
+    if (!repetition_free) continue;
+
+    const int depth = tablebase->probe_dtz(history.Last(), &state);
+    if (state == FAIL) continue;
+
+    // If depth == -1 this is wrong, since that is mate and the answer should
+    // be 0, but the move before depth is -2. Since data never contains a
+    // mate position, that discrepancy is ignored.
+    const int plies_remaining = std::abs(depth);
+    // This should be able to be <= 99 safely, but that is unproven.
+    if (steps + std::abs(depth) < 99) {
+      data.fileContents[i + 1].plies_left = plies_remaining;
+    }
+    // Right after a zeroing move, count backwards over all earlier chunks.
+    if (steps == 0) {
+      for (int j = i; j >= 0; j--) {
+        data.fileContents[j].plies_left = plies_remaining + (i + 1 - j);
+      }
+    }
+  }
+}
+
+// Rewrites training targets (result_q, result_d, plies_left) backwards from
+// blunders so that positions before a blunder are not trained on an outcome
+// the blunder caused.
+//
+// The walk starts at the last position before the game enters tablebase
+// range (or at the final position if it never does) and moves towards the
+// start of the game. Once the first blunder is seen, every earlier chunk is
+// overwritten with the running target.
+//
+// Side effects: increments blunders.
+void ApplyDeblunder(FileData& data, SyzygyTablebase* tablebase) {
+  // Deblunder only works from v6 data onwards; upgraded data is recognised
+  // by a visits field of 0.
+  if (!deblunderEnabled) return;
+  if (data.fileContents.back().visits == 0) return;
+
+  PositionHistory history;
+  int start_rule50;
+  int start_gameply;
+  ChessBoard start_board;
+
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &start_board, &start_rule50, &start_gameply);
+  history.Reset(start_board, start_rule50, start_gameply);
+
+  // Replay forward; stop just before the first position in tablebase range.
+  for (size_t ply = 0; ply < data.moves.size(); ply++) {
+    history.Append(data.moves[ply]);
+    const auto& cur_board = history.Last().GetBoard();
+    const bool in_tb_range =
+        cur_board.castlings().no_legal_castle() &&
+        (cur_board.ours() | cur_board.theirs()).count() <=
+            tablebase->max_cardinality();
+    if (in_tb_range) {
+      history.Pop();
+      break;
+    }
+  }
+
+  // Running target, seeded from the final chunk of the game.
+  float target_q = data.fileContents.back().result_q;
+  float target_d = data.fileContents.back().result_d;
+  float target_m = data.fileContents.back().plies_left;
+  bool rewriting = false;
+
+  for (;;) {
+    auto& cur = data.fileContents[history.GetLength() - 1];
+    // A blunder is defined by the played move being worse than the best move
+    // by a defined threshold, missing a forced win, or playing into a proven
+    // loss without being forced.
+    const bool over_threshold =
+        (cur.best_q - cur.played_q >
+         deblunderQBlunderThreshold - deblunderQBlunderWidth / 2.0);
+    const bool terminal_blunder =
+        (cur.best_q > -1 && cur.played_q < 1 &&
+         ((cur.best_q == 1 && ((cur.invariance_info & 8) != 0)) ||
+          cur.played_q == -1));
+    if (over_threshold || terminal_blunder) {
+      float blend = 1.0f;
+      // With a positive width and no terminal position involved, a soft
+      // threshold is applied by averaging the old and the new target.
+      if (deblunderQBlunderWidth > 0 && !terminal_blunder) {
+        blend = std::min(
+            1.0f, (cur.best_q - cur.played_q - deblunderQBlunderThreshold) /
+                          deblunderQBlunderWidth +
+                      0.5f);
+      }
+      // Instead of averaging, a randomization could be applied here with
+      // blend = blend > rand( [0, 1) ) ? 1.0f : 0.0f;
+      target_q = (1 - blend) * target_q + blend * cur.best_q;
+      target_d = (1 - blend) * target_d + blend * cur.best_d;
+      target_m = (1 - blend) * target_m + blend * cur.best_m;
+      rewriting = true;
+      blunders += 1;
+    }
+    if (rewriting) {
+      cur.result_q = target_q;
+      cur.result_d = target_d;
+      cur.plies_left = target_m;
+    }
+    if (history.GetLength() == 1) break;
+    // Step one ply back: Q is always from the side to move, so it flips,
+    // and the remaining-plies estimate grows by one.
+    target_q = -target_q;
+    target_m += 1.0f;
+    history.Pop();
+  }
+}
+
+// Re-encodes every chunk of `data` in the input format `newInputFormat`.
+// A value of -1 means "keep the current format" and makes this a no-op.
+void ConvertInputFormat(FileData& data, int newInputFormat) {
+  if (newInputFormat == -1) return;
+
+  PositionHistory history;
+  int start_rule50;
+  int start_gameply;
+  ChessBoard start_board;
+
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &start_board, &start_rule50, &start_gameply);
+  history.Reset(start_board, start_rule50, start_gameply);
+
+  // Chunk k is converted with the history that leads up to it, i.e. after
+  // the first k moves have been appended.
+  for (size_t k = 0;; ++k) {
+    ChangeInputFormat(newInputFormat, &data.fileContents[k], history);
+    if (k >= data.moves.size()) break;
+    history.Append(data.moves[k]);
+  }
+}
+
+// Appends the game in Stockfish "plain" format to `nnue_plain_file`.
+//
+// Does nothing when `nnue_plain_file` is empty. For chunks with visits > 0
+// (v6 data with an evaluated position) the move and score are chosen by
+// `flags.nnue_best_move` / `flags.nnue_best_score`; otherwise the move
+// actually played and best_q are used. The text is built without holding
+// the lock; only the append to the shared file is serialised through a
+// function-local static mutex.
+void WriteNnueOutput(const FileData& data, const std::string& nnue_plain_file,
+                     ProcessFileFlags flags) {
+  if (nnue_plain_file.empty()) return;
+
+  static Mutex mutex;
+  std::ostringstream out;
+
+  PositionHistory history;
+  int rule50ply;
+  int gameply;
+  ChessBoard board;
+
+  PopulateBoard(data.input_format, PlanesFromTrainingData(data.fileContents[0]),
+                &board, &rule50ply, &gameply);
+  history.Reset(board, rule50ply, gameply);
+
+  for (size_t i = 0; i < data.fileContents.size(); i++) {
+    // Bind by reference: copying a whole training record per position is
+    // pure overhead, the chunk is only read.
+    const auto& chunk = data.fileContents[i];
+    Position p = history.Last();
+    if (chunk.visits > 0) {
+      // Format is v6 and position is evaluated.
+      Move m = MoveFromNNIndex(
+          flags.nnue_best_move ? chunk.best_idx : chunk.played_idx,
+          TransformForPosition(data.input_format, history));
+      float q = flags.nnue_best_score ? chunk.best_q : chunk.played_q;
+      out << AsNnueString(p, m, q, round(chunk.result_q));
+    } else if (i < data.moves.size()) {
+      out << AsNnueString(p, data.moves[i], chunk.best_q,
+                          round(chunk.result_q));
+    }
+    // The last chunk has no following move.
+    if (i < data.moves.size()) {
+      history.Append(data.moves[i]);
+    }
+  }
+
+  std::ofstream file;
+  Mutex::Lock lock(mutex);
+  file.open(nnue_plain_file, std::ios_base::app);
+  if (!file.is_open()) {
+    // Keep going (best effort), but do not lose the output silently.
+    std::cerr << "Failed to open NNUE plain file: " << nnue_plain_file
+              << std::endl;
+    return;
+  }
+  file << out.str();
+  file.close();
+}
+
+// Writes the processed chunks of `data` to `outputDir`, under the base name
+// of the input path `file`. Does nothing when `outputDir` is empty.
+void WriteOutputs(const FileData& data, const std::string& file,
+                  const std::string& outputDir) {
+  if (outputDir.empty()) return;
+
+  // Strip the directory part; both '/' and '\\' count as separators.
+  const std::string baseName = file.substr(file.find_last_of("/\\") + 1);
+  TrainingDataWriter writer(outputDir + "/" + baseName);
+  for (size_t k = 0; k < data.fileContents.size(); ++k) {
+    const auto& entry = data.fileContents[k];
+    // Bit 64 of invariance_info marks chunks that only provide move history;
+    // those are not saved.
+    if ((entry.invariance_info & 64) != 0) continue;
+    writer.WriteChunk(entry);
+  }
+}
+
+// Runs the complete rescoring pipeline on one game and returns the result.
+//
+// The stages run in a fixed order because later stages read what earlier
+// ones wrote:
+//   1. validation and move decoding
+//   2. policy substitution
+//   3. Syzygy result rescoring
+//   4. policy adjustments (temperature, offset, boost)
+//   5. plies_left estimation
+//   6. Gaviota plies_left correction
+//   7. DTZ plies_left correction
+//   8. deblundering
+//   9. input format conversion
+FileData ProcessFileInternal(std::vector<V6TrainingData> fileContents,
+                             SyzygyTablebase* tablebase, float distTemp,
+                             float distOffset, float dtzBoost,
+                             int newInputFormat) {
+  FileData game = ProcessAndValidateFileData(std::move(fileContents));
+
+  ApplyPolicySubstitutions(game);
+  ApplySyzygyRescoring(game, tablebase);
+  ApplyPolicyAdjustments(game, tablebase, distTemp, distOffset, dtzBoost);
+
+  EstimateAndCorrectPliesLeft(game);
+  ApplyGaviotaCorrections(game);
+  ApplyDTZCorrections(game, tablebase);
+
+  ApplyDeblunder(game, tablebase);
+  ConvertInputFormat(game, newInputFormat);
+
+  return game;
+}
+
+// Reads, rescores and writes out a single training file.
+//
+// An Exception raised anywhere along the way is reported on stderr and the
+// file is abandoned. When `flags.delete_files` is set the input file is
+// removed afterwards, whether or not processing succeeded.
+void ProcessFile(const std::string& file, SyzygyTablebase* tablebase,
+                 std::string outputDir, float distTemp, float distOffset,
+                 float dtzBoost, int newInputFormat,
+                 std::string nnue_plain_file, ProcessFileFlags flags) {
+  try {
+    FileData processed =
+        ProcessFileInternal(ReadFile(file), tablebase, distTemp, distOffset,
+                            dtzBoost, newInputFormat);
+    WriteNnueOutput(processed, nnue_plain_file, flags);
+    WriteOutputs(processed, file, outputDir);
+  } catch (Exception& ex) {
+    std::cerr << "While processing: " << file
+              << " - Exception thrown: " << ex.what() << std::endl;
+    if (flags.delete_files) {
+      std::cerr << "It will be deleted." << std::endl;
+    }
+  }
+
+  if (!flags.delete_files) return;
+  remove(file.c_str());
+}
+
+// Worker loop: processes every `mod`-th entry of `files`, starting at index
+// `offset`, so that `mod` workers with offsets 0..mod-1 cover the list
+// without overlap. Entries whose name does not end in ".gz" are skipped.
+void ProcessFiles(const std::vector<std::string>& files,
+                  SyzygyTablebase* tablebase, std::string outputDir,
+                  float distTemp, float distOffset, float dtzBoost,
+                  int newInputFormat, int offset, int mod,
+                  std::string nnue_plain_file, ProcessFileFlags flags) {
+  static const std::string kGzSuffix = ".gz";
+  std::cerr << "Thread: " << offset << " starting" << std::endl;
+  for (size_t i = offset; i < files.size(); i += mod) {
+    const std::string& name = files[i];
+    // Check the length first: comparing rfind() against size() - 3 wraps
+    // around for short names, and a 2-character name then equals npos and
+    // would be treated as a .gz file.
+    const bool is_gz =
+        name.size() >= kGzSuffix.size() &&
+        name.compare(name.size() - kGzSuffix.size(), kGzSuffix.size(),
+                     kGzSuffix) == 0;
+    if (!is_gz) {
+      std::cerr << "Skipping: " << name << std::endl;
+      continue;
+    }
+    ProcessFile(name, tablebase, outputDir, distTemp, distOffset, dtzBoost,
+                newInputFormat, nnue_plain_file, flags);
+  }
+}
+
+// Loads policy substitution games into the policy_subs tree.
+//
+// Each file is read as one game. The tree is keyed at the root by a hash of
+// the starting board and its rule-50 counter, and below that by the policy
+// index of each move played. Every chunk that is not a history-only chunk
+// (invariance_info bit 64 clear) marks its node active and stores its
+// policy there.
+void BuildSubs(const std::vector<std::string>& files) {
+  for (auto& path : files) {
+    TrainingDataReader reader(path);
+    std::vector<V6TrainingData> chunks;
+    V6TrainingData scratch;
+    while (reader.ReadChunk(&scratch)) {
+      chunks.push_back(scratch);
+    }
+    Validate(chunks);
+
+    // Recover the move played between each pair of consecutive chunks.
+    MoveList played;
+    for (size_t k = 1; k < chunks.size(); k++) {
+      played.push_back(
+          DecodeMoveFromInput(PlanesFromTrainingData(chunks[k]),
+                              PlanesFromTrainingData(chunks[k - 1])));
+      // Decoded moves are from the point of view of the side to move after
+      // the move, so mirror them to make them applicable to the position
+      // before it.
+      played.back().Flip();
+    }
+    Validate(chunks, played);
+
+    // Subs are 'valid'.
+    PositionHistory history;
+    int start_rule50;
+    int start_gameply;
+    ChessBoard start_board;
+    auto format = static_cast<pblczero::NetworkFormat::InputFormat>(
+        chunks[0].input_format);
+    PopulateBoard(format, PlanesFromTrainingData(chunks[0]), &start_board,
+                  &start_rule50, &start_gameply);
+    history.Reset(start_board, start_rule50, start_gameply);
+    uint64_t root_key = HashCat(start_board.Hash(), start_rule50);
+    PolicySubNode* node = &policy_subs[root_key];
+
+    for (size_t k = 0; k < chunks.size(); k++) {
+      const bool history_only = (chunks[k].invariance_info & 64) != 0;
+      if (!history_only) {
+        node->active = true;
+        for (int slot = 0; slot < 1858; slot++) {
+          node->policy[slot] = chunks[k].probabilities[slot];
+        }
+      }
+      // The last chunk has no move leading out of it.
+      if (k < chunks.size() - 1) {
+        int transform = TransformForPosition(format, history);
+        int child = MoveToNNIndex(played[k], transform);
+        if (node->children[child] == nullptr) {
+          node->children[child] = new PolicySubNode();
+        }
+        node = node->children[child];
+        history.Append(played[k]);
+      }
+    }
+  }
+}
+
+}  // namespace
+
+// Character that separates the entries of the Gaviota tablebase path list
+// parsed in RescorerGaviotaSetup(): ';' on Windows, ':' everywhere else.
+#ifdef _WIN32
+#define SEP_CHAR ';'
+#else
+#define SEP_CHAR ':'
+#endif
+
+// Entry point of the rescorer tool.
+//
+// Registers and processes the options, initialises the Syzygy tablebases
+// (mandatory) plus the optional Gaviota tablebases, deblunder settings and
+// policy substitutions, then processes every file of the input directory,
+// on several threads if requested, and prints the collected statistics to
+// stdout. Configuration problems are reported on stderr and end the run.
+void RunRescorer() {
+  OptionsParser options;
+  // Reset the file-level statistics.
+  orig_counts[0] = 0;
+  orig_counts[1] = 0;
+  orig_counts[2] = 0;
+  fixed_counts[0] = 0;
+  fixed_counts[1] = 0;
+  fixed_counts[2] = 0;
+  for (int i = 0; i < 11; i++) policy_bump_total_hist[i] = 0;
+  for (int i = 0; i < 11; i++) policy_nobump_total_hist[i] = 0;
+  options.Add<StringOption>(kSyzygyTablebaseId);
+  options.Add<StringOption>(kGaviotaTablebaseId);
+  options.Add<StringOption>(kInputDirId);
+  options.Add<StringOption>(kOutputDirId);
+  options.Add<StringOption>(kPolicySubsDirId);
+  options.Add<IntOption>(kThreadsId, 1, 20) = 1;
+  options.Add<FloatOption>(kTempId, 0.001, 100) = 1;
+  // Positive dist offset requires knowing the legal move set, so not supported
+  // for now.
+  options.Add<FloatOption>(kDistributionOffsetId, -0.999, 0) = 0;
+  options.Add<FloatOption>(kMinDTZBoostId, 0, 1) = 0;
+  options.Add<IntOption>(kNewInputFormatId, -1, 256) = -1;
+  options.Add<BoolOption>(kDeblunder) = false;
+  options.Add<FloatOption>(kDeblunderQBlunderThreshold, 0.0f, 2.0f) = 2.0f;
+  options.Add<FloatOption>(kDeblunderQBlunderWidth, 0.0f, 2.0f) = 0.0f;
+  options.Add<StringOption>(kNnuePlainFileId);
+  options.Add<BoolOption>(kNnueBestScoreId) = true;
+  options.Add<BoolOption>(kNnueBestMoveId) = false;
+  options.Add<BoolOption>(kDeleteFilesId) = true;
+
+  if (!options.ProcessAllFlags()) return;
+
+  // At least one kind of output has to be requested.
+  if (options.GetOptionsDict().IsDefault<std::string>(kOutputDirId) &&
+      options.GetOptionsDict().IsDefault<std::string>(kNnuePlainFileId)) {
+    std::cerr << "Must provide an output dir or NNUE plain file." << std::endl;
+    return;
+  }
+
+  if (options.GetOptionsDict().Get<bool>(kDeblunder)) {
+    RescorerDeblunderSetup(
+        options.GetOptionsDict().Get<float>(kDeblunderQBlunderThreshold),
+        options.GetOptionsDict().Get<float>(kDeblunderQBlunderWidth));
+  }
+
+  SyzygyTablebase tablebase;
+  if (!tablebase.init(
+          options.GetOptionsDict().Get<std::string>(kSyzygyTablebaseId)) ||
+      tablebase.max_cardinality() < 3) {
+    std::cerr << "FAILED TO LOAD SYZYGY" << std::endl;
+    return;
+  }
+
+  RescorerGaviotaSetup(
+      options.GetOptionsDict().Get<std::string>(kGaviotaTablebaseId));
+
+  RescorerPolicySubstitutionSetup(
+      options.GetOptionsDict().Get<std::string>(kPolicySubsDirId));
+
+  auto inputDir = options.GetOptionsDict().Get<std::string>(kInputDirId);
+  if (inputDir.empty()) {
+    std::cerr << "Must provide an input dir." << std::endl;
+    return;
+  }
+  auto files = GetFileList(inputDir);
+  if (files.empty()) {
+    std::cerr << "No files to process" << std::endl;
+    return;
+  }
+  // Turn the bare file names into paths below the input directory.
+  std::transform(
+      files.begin(), files.end(), files.begin(),
+      [&inputDir](const std::string& file) { return inputDir + "/" + file; });
+  float dtz_boost = options.GetOptionsDict().Get<float>(kMinDTZBoostId);
+  unsigned int threads = options.GetOptionsDict().Get<int>(kThreadsId);
+  ProcessFileFlags flags;
+  flags.delete_files = options.GetOptionsDict().Get<bool>(kDeleteFilesId);
+  flags.nnue_best_score = options.GetOptionsDict().Get<bool>(kNnueBestScoreId);
+  flags.nnue_best_move = options.GetOptionsDict().Get<bool>(kNnueBestMoveId);
+  if (threads > 1) {
+    std::vector<std::thread> threads_;
+    int offset = 0;
+    while (threads_.size() < threads) {
+      int offset_val = offset;
+      offset++;
+      // `files` is captured by reference: it outlives the workers (they are
+      // all joined below), so there is no need to copy the whole list once
+      // per thread. Worker k handles files k, k + threads, k + 2 * threads...
+      threads_.emplace_back([&options, offset_val, &files, &tablebase, threads,
+                             dtz_boost, flags]() {
+        ProcessFiles(
+            files, &tablebase,
+            options.GetOptionsDict().Get<std::string>(kOutputDirId),
+            options.GetOptionsDict().Get<float>(kTempId),
+            options.GetOptionsDict().Get<float>(kDistributionOffsetId),
+            dtz_boost, options.GetOptionsDict().Get<int>(kNewInputFormatId),
+            offset_val, threads,
+            options.GetOptionsDict().Get<std::string>(kNnuePlainFileId), flags);
+      });
+    }
+    for (size_t i = 0; i < threads_.size(); i++) {
+      threads_[i].join();
+    }
+
+  } else {
+    ProcessFiles(
+        files, &tablebase,
+        options.GetOptionsDict().Get<std::string>(kOutputDirId),
+        options.GetOptionsDict().Get<float>(kTempId),
+        options.GetOptionsDict().Get<float>(kDistributionOffsetId), dtz_boost,
+        options.GetOptionsDict().Get<int>(kNewInputFormatId), 0, 1,
+        options.GetOptionsDict().Get<std::string>(kNnuePlainFileId), flags);
+  }
+  std::cout << "Games processed: " << games << std::endl;
+  std::cout << "Positions processed: " << positions << std::endl;
+  std::cout << "Rescores performed: " << rescored << std::endl;
+  std::cout << "Cumulative outcome change: " << delta << std::endl;
+  std::cout << "Secondary rescores performed: " << rescored2 << std::endl;
+  std::cout << "Secondary rescores performed used dtz: " << rescored3
+            << std::endl;
+  std::cout << "Blunders picked up by deblunder threshold: " << blunders
+            << std::endl;
+  std::cout << "Number of policy values boosted by dtz or dtm " << policy_bump
+            << std::endl;
+  std::cout << "Number of policy values boosted by dtm " << policy_dtm_bump
+            << std::endl;
+  std::cout << "Orig policy_sum dist of boost candidate:";
+  std::cout << std::endl;
+  int event_sum = 0;
+  for (int i = 0; i < 11; i++) event_sum += policy_bump_total_hist[i];
+  // Without any boost event both histograms are all zero; normalise by 1 so
+  // that the report shows zeros rather than NaN from 0 / 0.
+  const float event_total = event_sum > 0 ? (float)event_sum : 1.0f;
+  for (int i = 0; i < 11; i++) {
+    std::cout << " " << std::setprecision(4)
+              << ((float)policy_nobump_total_hist[i] / event_total);
+  }
+  std::cout << std::endl;
+  std::cout << "Boosted policy_sum dist of boost candidate:";
+  std::cout << std::endl;
+  for (int i = 0; i < 11; i++) {
+    std::cout << " " << std::setprecision(4)
+              << ((float)policy_bump_total_hist[i] / event_total);
+  }
+  std::cout << std::endl;
+  std::cout << "Original L: " << orig_counts[0] << " D: " << orig_counts[1]
+            << " W: " << orig_counts[2] << std::endl;
+  std::cout << "After L: " << fixed_counts[0] << " D: " << fixed_counts[1]
+            << " W: " << fixed_counts[2] << std::endl;
+  std::cout << "Gaviota DTM move_count rescores: " << gaviota_dtm_rescores
+            << std::endl;
+}
+
+// Library entry point: runs the rescoring pipeline on the chunks of one game
+// and returns the rescored chunks. Performs no file I/O of its own.
+std::vector<V6TrainingData> RescoreTrainingData(
+    std::vector<V6TrainingData> fileContents, SyzygyTablebase* tablebase,
+    float distTemp, float distOffset, float dtzBoost, int newInputFormat) {
+  FileData data =
+      ProcessFileInternal(std::move(fileContents), tablebase, distTemp,
+                          distOffset, dtzBoost, newInputFormat);
+  // A member of a local is not moved implicitly on return; without the
+  // explicit move the whole vector of chunks would be copied.
+  return std::move(data.fileContents);
+}
+
+// Stores the deblunder parameters and switches deblundering on.
+//   threshold: Q gap between best and played move that counts as a blunder.
+//   width:     width of the soft band around the threshold (0 = hard cut).
+// Always returns true.
+bool RescorerDeblunderSetup(float threshold, float width) {
+  deblunderQBlunderThreshold = threshold;
+  deblunderQBlunderWidth = width;
+  deblunderEnabled = true;
+  return true;
+}
+
+// Initialises the Gaviota tablebases from `dtmPaths`, a list of directories
+// separated by SEP_CHAR.
+//
+// With an empty list nothing is initialised. Returns whether Gaviota probing
+// is enabled after the call. Throws Exception when the tablebases found do
+// not give the expected availability mask.
+bool RescorerGaviotaSetup(std::string dtmPaths) {
+  if (dtmPaths.empty()) return gaviotaEnabled;
+
+  std::stringstream path_string_stream(dtmPaths);
+  std::string path;
+  auto paths = tbpaths_init();
+  while (std::getline(path_string_stream, path, SEP_CHAR)) {
+    paths = tbpaths_add(paths, path.c_str());
+  }
+  tb_init(0, tb_CP4, paths);
+  tbcache_init(64 * 1024 * 1024, 64);
+  // NOTE(review): 63 presumably means "all 3-, 4- and 5-piece tables are
+  // present", which the policy boost code relies on - confirm against the
+  // Gaviota probing API.
+  if (tb_availability() != 63) {
+    throw Exception("UNEXPECTED gaviota availability");
+  }
+  std::cerr << "Found Gaviota TBs" << std::endl;
+  gaviotaEnabled = true;
+  return gaviotaEnabled;
+}
+
+// Loads policy substitutions from every file in `policySubsDir` (skipped
+// when the directory name is empty). Returns whether any substitution is
+// available afterwards.
+bool RescorerPolicySubstitutionSetup(std::string policySubsDir) {
+  if (policySubsDir.empty()) return !policy_subs.empty();
+
+  auto subFiles = GetFileList(policySubsDir);
+  // Turn the bare file names into paths below the directory.
+  for (auto& entry : subFiles) {
+    entry = policySubsDir + "/" + entry;
+  }
+  BuildSubs(subFiles);
+  return !policy_subs.empty();
+}
+
+}  // namespace lczero
diff --git a/src/trainingdata/rescorer.h b/src/trainingdata/rescorer.h
new file mode 100644
index 0000000000..c38c0ff5c5
--- /dev/null
+++ b/src/trainingdata/rescorer.h
@@ -0,0 +1,48 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018-2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <vector>
+
+#include "syzygy/syzygy.h"
+#include "trainingdata/trainingdata_v6.h"
+
+namespace lczero {
+
+void RunRescorer();
+
+// Interface for external use. Setup calls return whether the feature is on.
+bool RescorerDeblunderSetup(float threshold, float width);
+bool RescorerGaviotaSetup(std::string dtmPaths);  // May throw Exception.
+bool RescorerPolicySubstitutionSetup(std::string policySubsDir);
+std::vector<V6TrainingData> RescoreTrainingData(
+    std::vector<V6TrainingData> fileContents, SyzygyTablebase* tablebase,
+    float distTemp = 1.0f, float distOffset = 0.0f, float dtzBoost = 0.0f,
+    int newInputFormat = -1);
+
+}  // namespace lczero
diff --git a/src/trainingdata/trainingdata.cc b/src/trainingdata/trainingdata.cc
index 1285dc7b49..2f9662f42a 100644
--- a/src/trainingdata/trainingdata.cc
+++ b/src/trainingdata/trainingdata.cc
@@ -111,10 +111,14 @@ void V6TrainingDataArray::Write(TrainingDataWriter* writer, GameResult result,
   }
 }
 
-void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history,
-                              Eval best_eval, Eval played_eval,
-                              bool best_is_proven, Move best_move,
-                              Move played_move, const NNCacheLock& nneval) {
+void V6TrainingDataArray::Add(const classic::Node* node,
+                              const PositionHistory& history,
+                              classic::Eval best_eval,
+                              classic::Eval played_eval, bool best_is_proven,
+                              Move best_move, Move played_move,
+                              std::span<Move> legal_moves,
+                              const std::optional<EvalResult>& nneval,
+                              float policy_softmax_temp) {
   V6TrainingData result;
   const auto& position = history.Last();
 
@@ -146,40 +150,22 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history,
   // Set moves probabilities according to their relative amount of visits.
   // Compute Kullback-Leibler divergence in nats (between policy and visits).
   float kld_sum = 0;
-  float max_p = -std::numeric_limits<float>::infinity();
-  std::vector<float> intermediate;
-  if (nneval) {
-    int last_idx = 0;
-    for (const auto& child : node->Edges()) {
-      auto nn_idx = child.edge()->GetMove().as_nn_index(transform);
-      float p = 0;
-      for (int i = 0; i < nneval->p.size(); i++) {
-        // Optimization: usually moves are stored in the same order as queried.
-        const auto& move = nneval->p[last_idx++];
-        if (last_idx == nneval->p.size()) last_idx = 0;
-        if (move.first == nn_idx) {
-          p = move.second;
-          break;
-        }
-      }
-      intermediate.emplace_back(p);
-      max_p = std::max(max_p, p);
-    }
-  }
   float total = 0.0;
-  auto it = intermediate.begin();
   for (const auto& child : node->Edges()) {
-    auto nn_idx = child.edge()->GetMove().as_nn_index(transform);
+    const Move move = child.GetMove();
     float fracv = total_n > 0 ? child.GetN() / static_cast<float>(total_n) : 1;
     if (nneval) {
-      float P = std::exp(*it - max_p);
+      size_t move_idx =
+          std::find(legal_moves.begin(), legal_moves.end(), move) -
+          legal_moves.begin();
+      // Undo any softmax temperature in the cached data.
+      float P = std::pow(nneval->p[move_idx], policy_softmax_temp);
       if (fracv > 0) {
         kld_sum += fracv * std::log(fracv / P);
       }
       total += P;
-      it++;
     }
-    result.probabilities[nn_idx] = fracv;
+    result.probabilities[MoveToNNIndex(move, transform)] = fracv;
   }
   if (nneval) {
     // Add small epsilon for backward compatibility with earlier value of 0.
@@ -197,10 +183,10 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history,
   uint8_t their_king_side = 1;
   // If frc trained, send the bit mask representing rook position.
   if (Is960CastlingFormat(input_format_)) {
-    our_queen_side <<= castlings.our_queenside_rook();
-    our_king_side <<= castlings.our_kingside_rook();
-    their_queen_side <<= castlings.their_queenside_rook();
-    their_king_side <<= castlings.their_kingside_rook();
+    our_queen_side <<= castlings.our_queenside_rook.idx;
+    our_king_side <<= castlings.our_kingside_rook.idx;
+    their_queen_side <<= castlings.their_queenside_rook.idx;
+    their_king_side <<= castlings.their_kingside_rook.idx;
   }
 
   result.castling_us_ooo = castlings.we_can_000() ? our_queen_side : 0;
@@ -234,7 +220,7 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history,
   result.result_q = 0;
   result.result_d = 1;
 
-  Eval orig_eval;
+  classic::Eval orig_eval;
   if (nneval) {
     orig_eval.wl = nneval->q;
     orig_eval.d = nneval->d;
@@ -271,11 +257,11 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history,
 
   result.visits = node->GetN();
   if (position.IsBlackToMove()) {
-    best_move.Mirror();
-    played_move.Mirror();
+    best_move.Flip();
+    played_move.Flip();
   }
-  result.best_idx = best_move.as_nn_index(transform);
-  result.played_idx = played_move.as_nn_index(transform);
+  result.best_idx = MoveToNNIndex(best_move, transform);
+  result.played_idx = MoveToNNIndex(played_move, transform);
   result.reserved = 0;
 
   // Unknown here - will be filled in once the full data has been collected.
diff --git a/src/trainingdata/trainingdata.h b/src/trainingdata/trainingdata.h
index 6fc3b3b8a5..8780bcb611 100644
--- a/src/trainingdata/trainingdata.h
+++ b/src/trainingdata/trainingdata.h
@@ -27,66 +27,13 @@
 
 #pragma once
 
-#include "mcts/node.h"
+#include "neural/backend.h"
+#include "search/classic/node.h"
 #include "trainingdata/writer.h"
+#include "trainingdata/trainingdata_v6.h"
 
 namespace lczero {
 
-#pragma pack(push, 1)
-
-struct V6TrainingData {
-  uint32_t version;
-  uint32_t input_format;
-  float probabilities[1858];
-  uint64_t planes[104];
-  uint8_t castling_us_ooo;
-  uint8_t castling_us_oo;
-  uint8_t castling_them_ooo;
-  uint8_t castling_them_oo;
-  // For input type 3 contains enpassant column as a mask.
-  uint8_t side_to_move_or_enpassant;
-  uint8_t rule50_count;
-  // Bitfield with the following allocation:
-  //  bit 7: side to move (input type 3)
-  //  bit 6: position marked for deletion by the rescorer (never set by lc0)
-  //  bit 5: game adjudicated (v6)
-  //  bit 4: max game length exceeded (v6)
-  //  bit 3: best_q is for proven best move (v6)
-  //  bit 2: transpose transform (input type 3)
-  //  bit 1: mirror transform (input type 3)
-  //  bit 0: flip transform (input type 3)
-  // In versions prior to v5 this spot contained an unused move count field.
-  uint8_t invariance_info;
-  // In versions prior to v6 this spot contained thr result as an int8_t.
-  uint8_t dummy;
-  float root_q;
-  float best_q;
-  float root_d;
-  float best_d;
-  float root_m;      // In plies.
-  float best_m;      // In plies.
-  float plies_left;  // This is the training target for MLH.
-  float result_q;
-  float result_d;
-  float played_q;
-  float played_d;
-  float played_m;
-  // The folowing may be NaN if not found in cache.
-  float orig_q;      // For value repair.
-  float orig_d;
-  float orig_m;
-  uint32_t visits;
-  // Indices in the probabilities array.
-  uint16_t played_idx;
-  uint16_t best_idx;
-  // Kullback-Leibler divergence between visits and policy (denominator)
-  float policy_kld;
-  uint32_t reserved;
-} PACKED_STRUCT;
-static_assert(sizeof(V6TrainingData) == 8356, "Wrong struct size");
-
-#pragma pack(pop)
-
 class V6TrainingDataArray {
  public:
   V6TrainingDataArray(FillEmptyHistory white_fill_empty_history,
@@ -96,9 +43,11 @@ class V6TrainingDataArray {
         input_format_(input_format) {}
 
   // Add a chunk.
-  void Add(const Node* node, const PositionHistory& history, Eval best_eval,
-           Eval played_eval, bool best_is_proven, Move best_move,
-           Move played_move, const NNCacheLock& nneval);
+  void Add(const classic::Node* node, const PositionHistory& history,
+           classic::Eval best_eval, classic::Eval played_eval,
+           bool best_is_proven, Move best_move, Move played_move,
+           std::span<Move> legal_moves,
+           const std::optional<EvalResult>& nneval, float policy_softmax_temp);
 
   // Writes training data to a file.
   void Write(TrainingDataWriter* writer, GameResult result,
diff --git a/src/trainingdata/trainingdata_v6.h b/src/trainingdata/trainingdata_v6.h
new file mode 100644
index 0000000000..0b5c986c0e
--- /dev/null
+++ b/src/trainingdata/trainingdata_v6.h
@@ -0,0 +1,90 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2021 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include <cstdint>
+#include "utils/cppattributes.h"
+
+namespace lczero {
+
+#pragma pack(push, 1)
+
+struct V6TrainingData {
+  uint32_t version;
+  uint32_t input_format;
+  float probabilities[1858];
+  uint64_t planes[104];
+  uint8_t castling_us_ooo;
+  uint8_t castling_us_oo;
+  uint8_t castling_them_ooo;
+  uint8_t castling_them_oo;
+  // For input type 3 contains enpassant column as a mask.
+  uint8_t side_to_move_or_enpassant;
+  uint8_t rule50_count;
+  // Bitfield with the following allocation:
+  //  bit 7: side to move (input type 3)
+  //  bit 6: position marked for deletion by the rescorer (never set by lc0)
+  //  bit 5: game adjudicated (v6)
+  //  bit 4: max game length exceeded (v6)
+  //  bit 3: best_q is for proven best move (v6)
+  //  bit 2: transpose transform (input type 3)
+  //  bit 1: mirror transform (input type 3)
+  //  bit 0: flip transform (input type 3)
+  // In versions prior to v5 this spot contained an unused move count field.
+  uint8_t invariance_info;
+  // In versions prior to v6 this spot contained the result as an int8_t.
+  uint8_t dummy;
+  float root_q;
+  float best_q;
+  float root_d;
+  float best_d;
+  float root_m;      // In plies.
+  float best_m;      // In plies.
+  float plies_left;  // This is the training target for MLH.
+  float result_q;
+  float result_d;
+  float played_q;
+  float played_d;
+  float played_m;
+  // The following may be NaN if not found in cache.
+  float orig_q;  // For value repair.
+  float orig_d;
+  float orig_m;
+  uint32_t visits;
+  // Indices in the probabilities array.
+  uint16_t played_idx;
+  uint16_t best_idx;
+  // Kullback-Leibler divergence between visits and policy (denominator)
+  float policy_kld;
+  uint32_t reserved;
+} PACKED_STRUCT;
+static_assert(sizeof(V6TrainingData) == 8356, "Wrong struct size");
+
+#pragma pack(pop)
+
+}  // namespace lczero
diff --git a/src/utils/atomic_vector.h b/src/utils/atomic_vector.h
new file mode 100644
index 0000000000..50c371fd3c
--- /dev/null
+++ b/src/utils/atomic_vector.h
@@ -0,0 +1,86 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2024 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+namespace lczero {
+
+template <typename T>
+class AtomicVector {
+ public:
+  explicit AtomicVector(size_t capacity) : capacity_(capacity), size_(0) {
+    data_ = new
+        typename std::aligned_storage<sizeof(T), alignof(T)>::type[capacity];
+  }
+
+  ~AtomicVector() {
+    clear();  // Destroy live elements before freeing the raw storage.
+    delete[] data_;
+  }
+
+  // Thread safe, returns the index of the inserted element.
+  template <typename... Args>
+  size_t emplace_back(Args&&... args) {
+    size_t i = size_.fetch_add(1, std::memory_order_relaxed);  // Claim slot.
+    assert(i < capacity_);  // Fixed capacity: the storage never grows.
+    new (&data_[i]) T(std::forward<Args>(args)...);  // Construct in place.
+    return i;
+  }
+  // Element access; the index is checked by assert only.
+  T& operator[](size_t i) {
+    assert(i < size());
+    return *reinterpret_cast<T*>(&data_[i]);
+  }
+
+  const T& operator[](size_t i) const {
+    assert(i < size());
+    return *reinterpret_cast<const T*>(&data_[i]);
+  }
+  // size() counts claimed slots; the newest may still be under construction.
+  size_t size() const { return size_.load(std::memory_order_relaxed); }
+  size_t capacity() const { return capacity_; }
+
+  // Not thread safe. Destroys elements from the highest index down to 0.
+  void clear() {
+    for (size_t i = size_.load(std::memory_order_relaxed); i-- > 0;) {
+      reinterpret_cast<T*>(&data_[i])->~T();
+    }
+    size_.store(0, std::memory_order_relaxed);
+  }
+  // Raw-pointer iteration over the slots [0, size()).
+  T* begin() { return reinterpret_cast<T*>(data_); }
+  T* end() { return reinterpret_cast<T*>(data_) + size(); }
+  const T* begin() const { return reinterpret_cast<const T*>(data_); }
+  const T* end() const { return reinterpret_cast<const T*>(data_) + size(); }
+
+ private:
+  const size_t capacity_;
+  std::atomic<size_t> size_;
+  typename std::aligned_storage<sizeof(T), alignof(T)>::type* data_;
+};
+
+}  // namespace lczero
\ No newline at end of file
diff --git a/src/utils/bit.h b/src/utils/bit.h
new file mode 100644
index 0000000000..7c335e2e0b
--- /dev/null
+++ b/src/utils/bit.h
@@ -0,0 +1,45 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+#include <bit>
+#include <cstring>
+
+namespace lczero {
+
+#if __cpp_lib_bit_cast >= 201806L
+using std::bit_cast;
+#else
+template <class To, class From>
+To bit_cast(const From& src) noexcept {
+  To dst;  // Requires a default-constructible To.
+  std::memcpy((void*)&dst, &src, sizeof(To));  // Reads sizeof(To) from src.
+  return dst;
+}
+#endif
+
+}  // namespace lczero
diff --git a/src/utils/bititer.h b/src/utils/bititer.h
index 6c69c11889..c4831052f8 100644
--- a/src/utils/bititer.h
+++ b/src/utils/bititer.h
@@ -90,20 +90,20 @@ inline uint64_t TransposeBitsInBytes(uint64_t v) {
 
 // Iterates over all set bits of the value, lower to upper. The value of
 // dereferenced iterator is bit number (lower to upper, 0 bazed)
-template <typename T>
+template <typename T, typename Convert = std::identity>
 class BitIterator {
  public:
-  using iterator_category = std::input_iterator_tag;
+  using iterator_category = std::forward_iterator_tag;
   using difference_type = T;
   using value_type = T;
   using pointer = T*;
   using reference = T&;
 
-  BitIterator(std::uint64_t value) : value_(value){};
+  BitIterator(std::uint64_t value) : value_(value) {};
   bool operator!=(const BitIterator& other) { return value_ != other.value_; }
 
   void operator++() { value_ &= (value_ - 1); }
-  T operator*() const { return GetLowestBit(value_); }
+  T operator*() const { return Convert()(GetLowestBit(value_)); }
 
  private:
   std::uint64_t value_;
@@ -112,8 +112,9 @@ class BitIterator {
 class IterateBits {
  public:
   IterateBits(std::uint64_t value) : value_(value) {}
-  BitIterator<int> begin() { return value_; }
-  BitIterator<int> end() { return 0; }
+  using Iterator = BitIterator<int>;
+  Iterator begin() { return value_; }
+  Iterator end() { return 0; }
 
  private:
   std::uint64_t value_;
diff --git a/src/utils/cache.h b/src/utils/cache.h
index 763033c3f8..e2a703a6d0 100644
--- a/src/utils/cache.h
+++ b/src/utils/cache.h
@@ -150,8 +150,7 @@ class HashKeyedCache {
     size_t idx = key % hash_.size();
     while (true) {
       if (!hash_[idx].in_use) break;
-      if (hash_[idx].key == key &&
-          hash_[idx].value.get() == value) {
+      if (hash_[idx].key == key && hash_[idx].value.get() == value) {
         --hash_[idx].pins;
         return;
       }
@@ -301,6 +300,7 @@ class HashKeyedCacheLock {
   HashKeyedCacheLock(const HashKeyedCacheLock&) = delete;
 
   // Returns whether lock holds any value.
+  bool holds_value() const { return value_; }
   operator bool() const { return value_; }
 
   // Gets the value.
diff --git a/src/utils/commandline.cc b/src/utils/commandline.cc
index 22df960462..48f11351be 100644
--- a/src/utils/commandline.cc
+++ b/src/utils/commandline.cc
@@ -56,7 +56,7 @@ void CommandLine::Init(int argc, const char** argv) {
   LOGFILE << "Command line: " << binary_ << params.str();
 }
 
-bool CommandLine::ConsumeCommand(const std::string& command) {
+bool CommandLine::ConsumeCommand(std::string_view command) {
   if (arguments_.empty()) return false;
   if (arguments_[0] != command) return false;
   arguments_.erase(arguments_.begin());
diff --git a/src/utils/commandline.h b/src/utils/commandline.h
index 205f155b26..c9621212b2 100644
--- a/src/utils/commandline.h
+++ b/src/utils/commandline.h
@@ -47,7 +47,7 @@ class CommandLine {
 
   // If the first command line parameter is @command, remove it and return
   // true. Otherwise return false.
-  static bool ConsumeCommand(const std::string& command);
+  static bool ConsumeCommand(std::string_view command);
 
   // Command line arguments.
   static const std::vector<std::string>& Arguments() { return arguments_; }
diff --git a/src/utils/configfile.cc b/src/utils/configfile.cc
index 1205c73f71..98bded5862 100644
--- a/src/utils/configfile.cc
+++ b/src/utils/configfile.cc
@@ -142,7 +142,7 @@ bool ConfigFile::ParseFile(std::string& filename) {
     if (line.substr(0, 1) == "#") continue;
     // Skip blank lines.
     if (line.length() == 0) continue;
-    // Allow long form arugments that omit '--'.  If omitted, add here.
+    // Allow long form arguments that omit '--'.  If omitted, add here.
     if (line.substr(0, 1) != "-" && line.substr(0, 2) != "--") {
       line = "--" + line;
     }
diff --git a/src/utils/fp16_utils.h b/src/utils/fp16_utils.h
index 2680536599..9efa0e6574 100644
--- a/src/utils/fp16_utils.h
+++ b/src/utils/fp16_utils.h
@@ -27,7 +27,8 @@
 #pragma once
 
 #include <cstdint>
-#include <cstring>
+
+#include "utils/bit.h"
 
 // Define NO_F16C to avoid the F16C intrinsics. Also disabled with NO_POPCNT
 // since it catches most processors without F16C instructions.
@@ -40,59 +41,18 @@
 
 namespace lczero {
 
-#if defined(NO_POPCNT) || defined(NO_F16C) || \
-    (defined(__GNUC__) && !defined(__F16C__))
+#if defined(HAS_FLOAT16) && (defined(__F16C__) || defined(__aarch64__))
 
 inline uint16_t FP32toFP16(float f32) {
-  unsigned int x;
-  unsigned int sign = 0;
-  memcpy(&x, &f32, sizeof(float));
-  if (x & 0x80000000) sign = 0x8000;
-  x &= 0x7fffffff;
-  if (x >= 0x477ff000) {
-    if ((x & 0x7f800000) == 0x7f800000 && (x & 0x7fffff)) {
-      x = ((x >> 13) - 0x38000) | 0x200;
-    } else {
-      x = 0x7c00;
-    }
-  } else if (x <= 0x33000000)
-    x = 0;
-  else if (x <= 0x387fefff) {
-    int shift = 126 - ((x >> 23) & 0xff);
-    x = (x & 0x7fffff) | 0x800000;
-    if (x & (0x17fffff >> (24 - shift))) x += 0x800000 >> (24 - shift);
-    x >>= shift;
-  } else {
-    // Adjust exponent and round to nearest even.
-    if (x & 0x2fff) {
-      x -= 0x37fff000;
-    } else {
-      x -= 0x38000000;
-    }
-    x >>= 13;
-  }
-  return x | sign;
+  return bit_cast<uint16_t>(static_cast<_Float16>(f32));
 }
 
 inline float FP16toFP32(uint16_t f16) {
-  unsigned int x;
-  float f;
-  x = f16 & 0x7fff;
-  if ((x & 0x7c00) == 0) {
-    f = 5.9604645e-8f * x;
-    memcpy(&x, &f, sizeof(float));
-  } else if (x >= 0x7c00) {
-    if (x & 0x1ff) x |= 0x200;
-    x = (x + 0x38000) << 13;
-  } else {
-    x = (x + 0x1c000) << 13;
-  }
-  if (f16 & 0x8000) x |= 0x80000000;
-  memcpy(&f, &x, sizeof(float));
-  return f;
+  return static_cast<float>(bit_cast<_Float16>(f16));
 }
 
-#else
+#elif !defined(NO_POPCNT) && !defined(NO_F16C) && \
+    (!defined(__GNUC__) || defined(__F16C__))
 
 inline uint16_t FP32toFP16(float f32) {
   __m128 A = _mm_set_ss(f32);
@@ -107,6 +67,56 @@ inline float FP16toFP32(uint16_t f16) {
   return _mm_cvtss_f32(A);
 }
 
+#else
+
+inline uint16_t FP32toFP16(float f32) {
+  uint32_t x = bit_cast<uint32_t>(f32);
+  uint32_t sign = (x & 0x80000000) >> 16;  // fp32 sign bit moved to bit 15.
+  x &= 0x7fffffff;  // Continue with the magnitude only.
+  if (x < 0x477ff000) {  // Below 65520.0f, i.e. still finite in fp16.
+    if (x >= 0x387ff000) {
+      // Normal fp16 result. Adjust exponent and round to nearest even.
+      // Branchless idea from <https://gist.github.com/rygorous/2156668>.
+      x += (x >> 13) & 1;
+      x -= 0x37fff001;
+      x >>= 13;
+    } else {
+      // Subnormal or zero. The result is the last bits of fabs(f32) + 0.5f.
+      x = bit_cast<uint32_t>(bit_cast<float>(x) + 0.5f);
+    }
+  } else {
+    if (x > 0x7f800000) {
+      // NaN. Keep the top payload bits and force the quiet bit (0x200).
+      x = ((x >> 13) - 0x38000) | 0x200;
+    } else {
+      // Inf, or finite but too large for fp16: the result is infinity.
+      x = 0x7c00;
+    }
+  }
+  return x | sign;  // Narrowing to uint16_t keeps only the low 16 bits.
+}
+
+inline float FP16toFP32(uint16_t f16) {
+  int32_t s = static_cast<int16_t>(f16);  // Sign-extended: bit 18 = sign.
+  uint32_t x;
+  float f;
+  if ((s & 0x7c00) == 0) {
+    // Subnormal or zero. Scale to float.
+    x = s & 0x7fff;
+    f = 5.9604645e-8f * x;
+    x = bit_cast<uint32_t>(f);
+    if (s & 0x8000) x |= 0x80000000;
+  } else if ((s & 0x7c00) == 0x7c00) {
+    // Inf or NaN. Adjust exponent and shift (unsigned, as in the normal case).
+    if (s & 0x1ff) s |= 0x200;  // Change sNaN to qNaN as intel does.
+    x = ((s & 0x47fff) + 0x38000U) << 13;
+  } else {
+    // Normal. Adjust exponent and shift.
+    x = ((s & 0x47fff) + 0x1c000U) << 13;
+  }
+  return bit_cast<float>(x);
+}
+
 #endif
 
 }  // namespace lczero
diff --git a/src/utils/fp16_utils_test.cc b/src/utils/fp16_utils_test.cc
new file mode 100644
index 0000000000..32aa66f240
--- /dev/null
+++ b/src/utils/fp16_utils_test.cc
@@ -0,0 +1,107 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2018 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "utils/fp16_utils.h"
+
+#include <gtest/gtest.h>
+
+namespace lczero {
+
+testing::AssertionResult FP16Equal(const char* a_expr, const char* b_expr,
+                                   uint16_t a, uint16_t b) {
+  if (a == b) return testing::AssertionSuccess();
+  std::ostringstream oss_a;  // Hex rendering of the first bit pattern.
+  oss_a << std::hex << a;
+  std::ostringstream oss_b;  // Hex rendering of the second bit pattern.
+  oss_b << std::hex << b;
+  return testing::AssertionFailure()
+         << "Expected FP16 values to be equal:\n"
+         << "  " << a_expr << "\n"
+         << "     Which is: 0x" << oss_a.str() << "\n"
+         << "  " << b_expr << "\n"
+         << "     Which is: 0x" << oss_b.str() << "\n";
+}
+
+TEST(FP16, TestNormalConversion) {
+  float values[] = {0.0f,
+                    -0.000000029802322f,  // Half the smallest subnormal: tie.
+                    0.000000029802326f,   // Just above that tie: rounds up.
+                    -0.000000059604645f,
+                    0.000060975552f,     // Largest fp16 subnormal (0x3ff).
+                    -0.00006103515625f,  // Smallest normal magnitude.
+                    0.1f,
+                    -0.5f,
+                    0.99951172f,
+                    -1.0f,
+                    1.00097656f,
+                    -2.0f,
+                    3.5f,
+                    -4.25f,
+                    65488.0f,     // Tie between 0x7BFE and 0x7BFF: to even.
+                    -65488.004f,  // Just past that tie: rounds away.
+                    65504.0f,     // Largest finite fp16.
+                    -65519.996f,  // Last value that stays finite.
+                    65520.0f,     // First value that becomes infinity.
+                    -std::numeric_limits<float>::infinity()};
+  uint16_t expected_fp16[] = {0x0,    0x8000, 0x1,    0x8001, 0x3ff,
+                              0x8400, 0x2E66, 0xB800, 0x3BFF, 0xBC00,
+                              0x3C01, 0xC000, 0x4300, 0xC440, 0x7BFE,
+                              0xFBFF, 0x7BFF, 0xFBFF, 0x7C00, 0xFC00};
+  float expected_fp32[] = {0.0f,
+                           -0.0f,
+                           0.000000059604645f,
+                           -0.000000059604645f,
+                           0.000060975552f,
+                           -0.00006103515625f,
+                           0.0999755859f,
+                           -0.5f,
+                           0.99951172f,
+                           -1.0f,
+                           1.00097656f,
+                           -2.0f,
+                           3.5f,
+                           -4.25f,
+                           65472.0f,
+                           -65504.0f,
+                           65504.0f,
+                           -65504.0f,
+                           std::numeric_limits<float>::infinity(),
+                           -std::numeric_limits<float>::infinity()};
+  for (size_t i = 0; i < sizeof(values) / sizeof(values[0]); ++i) {
+    uint16_t fp16 = FP32toFP16(values[i]);  // Forward conversion under test.
+    EXPECT_PRED_FORMAT2(FP16Equal, fp16, expected_fp16[i]) << " at index " << i;
+    float back = FP16toFP32(fp16);  // Round trip from the converted bits.
+    EXPECT_FLOAT_EQ(back, expected_fp32[i]) << " at index " << i;
+  }
+}
+
+}  // namespace lczero
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);  // Let gtest parse its own flags.
+  return RUN_ALL_TESTS();  // The return value becomes the exit status.
+}
diff --git a/src/utils/logging.h b/src/utils/logging.h
index a9508de12c..907aaf8f6c 100644
--- a/src/utils/logging.h
+++ b/src/utils/logging.h
@@ -27,6 +27,7 @@
 
 #pragma once
 
+#include <chrono>
 #include <deque>
 #include <fstream>
 #include <iomanip>
@@ -89,4 +90,4 @@ std::string FormatTime(std::chrono::time_point<std::chrono::system_clock> time);
 
 #define LOGFILE ::lczero::LogMessage(__FILE__, __LINE__)
 #define CERR ::lczero::StderrLogMessage(__FILE__, __LINE__)
-#define COUT ::lczero::StdoutLogMessage(__FILE__, __LINE__)
\ No newline at end of file
+#define COUT ::lczero::StdoutLogMessage(__FILE__, __LINE__)
diff --git a/src/utils/mutex.h b/src/utils/mutex.h
index 4151af3fb8..4177e35d14 100644
--- a/src/utils/mutex.h
+++ b/src/utils/mutex.h
@@ -125,8 +125,12 @@ class CAPABILITY("mutex") SharedMutex {
 };
 
 static inline void SpinloopPause() {
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(_M_X64)
   _mm_pause();
+#elif defined(_MSC_VER)
+  __asm {}
+#else
+  asm volatile("");
 #endif
 }
 
diff --git a/src/utils/optionsdict.cc b/src/utils/optionsdict.cc
index b515914c36..1c820c59af 100644
--- a/src/utils/optionsdict.cc
+++ b/src/utils/optionsdict.cc
@@ -74,7 +74,7 @@ std::vector<std::string> OptionsDict::ListSubdicts() const {
 }
 
 bool OptionsDict::HasSubdict(const std::string& name) const {
-  return subdicts_.find(name) != subdicts_.end();
+  return subdicts_.contains(name);
 }
 
 namespace {
diff --git a/src/utils/optionsdict.h b/src/utils/optionsdict.h
index e0ae3e2ad0..e4496be85f 100644
--- a/src/utils/optionsdict.h
+++ b/src/utils/optionsdict.h
@@ -27,6 +27,7 @@
 
 #pragma once
 
+#include <atomic>
 #include <map>
 #include <memory>
 #include <optional>
@@ -43,21 +44,35 @@ class TypeDict {
  protected:
   struct V {
     const T& Get() const {
-      was_read_since_last_set_ = true;
+      was_read_since_last_set_.store(true, std::memory_order::release);
       return value_;
     }
     T& Get() {
-      was_read_since_last_set_ = true;
+      was_read_since_last_set_.store(true, std::memory_order::release);
       return value_;
     }
     void Set(const T& v) {
-      was_read_since_last_set_ = false;
       value_ = v;
+      was_read_since_last_set_.store(false, std::memory_order::release);
+    }
+    bool WasReadSinceLastSet() const {  // Has Get() run since the last Set()?
+      return was_read_since_last_set_.load(std::memory_order::acquire);
     }
-    bool WasReadSinceLastSet() const { return was_read_since_last_set_; }
 
+    V() = default;
+    V(const V& o) :  // std::atomic is not copyable; copy the flag by hand.
+      was_read_since_last_set_{o.was_read_since_last_set_.load(std::memory_order::acquire)},
+      value_{o.value_} {
+    }
+    V& operator=(const V& o) {  // std::atomic is not copy-assignable either.
+      value_ = o.value_;
+      was_read_since_last_set_.store(o.was_read_since_last_set_.load(std::memory_order::acquire),
+                                     std::memory_order::release);
+      return *this;
+    }
+    V(const T& v) : value_{v} {}  // Flag keeps its default (false).
    private:
-    mutable bool was_read_since_last_set_ = false;
+    mutable std::atomic<bool> was_read_since_last_set_ = false;
     T value_;
   };
   void EnsureNoUnusedOptions(const std::string& type_name,
@@ -79,6 +94,34 @@ class TypeDict {
 
 class OptionId {
  public:
+  enum VisibilityMode {  // One bit per mode so masks can OR them together.
+    kSimpleMode = 1 << 0,  // Binary name contains "simple".
+    kNormalMode = 1 << 1,  // OptionsParser default.
+    kProMode = 1 << 2,     // Binary name contains "pro", or --show-hidden.
+  };
+
+  enum VisibilityMask {  // Set of modes in which an option is listed.
+    kSimpleOnly = kSimpleMode,  // Simple mode only.
+    kDefaultVisibility = kNormalMode | kProMode,  // All but simple mode.
+    kProOnly = kProMode,  // Pro mode only.
+    kAlwaysVisible = kSimpleMode | kNormalMode | kProMode,  // Every mode.
+  };
+
+  struct OptionsParams {
+    const char* long_flag = nullptr;
+    const char* uci_option = nullptr;
+    const char* help_text = nullptr;
+    char short_flag = '\0';
+    VisibilityMask visibility = kDefaultVisibility;
+  };
+
+  OptionId(const OptionsParams& params)
+      : long_flag_(params.long_flag),
+        uci_option_(params.uci_option),
+        help_text_(params.help_text),
+        short_flag_(params.short_flag),
+        visibility_mask_(params.visibility) {}
+
   OptionId(const char* long_flag, const char* uci_option, const char* help_text,
            const char short_flag = '\0')
       : long_flag_(long_flag),
@@ -93,12 +136,14 @@ class OptionId {
   const char* uci_option() const { return uci_option_; }
   const char* help_text() const { return help_text_; }
   char short_flag() const { return short_flag_; }
+  uint64_t visibility_mask() const { return visibility_mask_; }
 
  private:
   const char* const long_flag_;
   const char* const uci_option_;
   const char* const help_text_;
   const char short_flag_;
+  uint64_t visibility_mask_ = kDefaultVisibility;
 };
 
 class Button {
diff --git a/src/utils/optionsparser.cc b/src/utils/optionsparser.cc
index fddd8de1db..144fa0918d 100644
--- a/src/utils/optionsparser.cc
+++ b/src/utils/optionsparser.cc
@@ -27,6 +27,7 @@
 
 #include "optionsparser.h"
 
+#include <charconv>
 #include <iomanip>
 #include <iostream>
 #include <sstream>
@@ -36,12 +37,6 @@
 #include "utils/logging.h"
 #include "utils/string.h"
 
-#if __has_include(<charconv>)
-#include <charconv>
-#else
-#define NO_CHARCONV
-#endif
-
 namespace lczero {
 namespace {
 const int kHelpIndent = 15;
@@ -56,7 +51,8 @@ OptionsParser::OptionsParser() : values_(*defaults_.AddSubdict("values")) {}
 std::vector<std::string> OptionsParser::ListOptionsUci() const {
   std::vector<std::string> result;
   for (const auto& iter : options_) {
-    if (!iter->GetUciOption().empty() && !iter->hidden_) {
+    if (!iter->GetUciOption().empty() &&
+        (iter->GetId().visibility_mask() & visibility_mode_)) {
       result.emplace_back("option name " + iter->GetUciOption() + " " +
                           iter->GetOptionString(values_));
     }
@@ -75,22 +71,6 @@ void OptionsParser::SetUciOption(const std::string& name,
   throw Exception("Unknown option: " + name);
 }
 
-void OptionsParser::HideOption(const OptionId& id) {
-  const auto option = FindOptionById(id);
-  if (option) option->hidden_ = true;
-}
-
-void OptionsParser::HideAllOptions() {
-  for (const auto& option : options_) {
-    option->hidden_ = true;
-  }
-}
-
-void OptionsParser::UnhideOption(const OptionId& id) {
-  const auto option = FindOptionById(id);
-  if (option) option->hidden_ = false;
-}
-
 OptionsParser::Option* OptionsParser::FindOptionByLongFlag(
     const std::string& flag) const {
   for (const auto& val : options_) {
@@ -140,13 +120,15 @@ bool OptionsParser::ProcessAllFlags() {
 
 bool OptionsParser::ProcessFlags(const std::vector<std::string>& args) {
   auto show_help = false;
-  if (CommandLine::BinaryName().find("pro") != std::string::npos) {
-    ShowHidden();
+  if (CommandLine::BinaryName().find("simple") != std::string::npos) {
+    visibility_mode_ = OptionId::kSimpleMode;
+  } else if (CommandLine::BinaryName().find("pro") != std::string::npos) {
+    visibility_mode_ = OptionId::kProMode;
   }
   for (auto iter = args.begin(), end = args.end(); iter != end; ++iter) {
     std::string param = *iter;
     if (param == "--show-hidden") {
-      ShowHidden();
+      visibility_mode_ = OptionId::kProMode;
       continue;
     }
     if (param == "-h" || param == "--help") {
@@ -288,7 +270,9 @@ void OptionsParser::ShowHelp() const {
   std::cout << FormatFlag('\0', "show-hidden",
                           "Show hidden options. Use with --help.");
   for (const auto& option : options_) {
-    if (!option->hidden_) std::cout << option->GetHelp(defaults_);
+    if ((option->GetId().visibility_mask() & visibility_mode_)) {
+      std::cout << option->GetHelp(values_);
+    }
   }
 
   auto contexts = values_.ListSubdicts();
@@ -300,10 +284,6 @@ void OptionsParser::ShowHelp() const {
   }
 }
 
-void OptionsParser::ShowHidden() const {
-  for (const auto& option : options_) option->hidden_ = false;
-}
-
 /////////////////////////////////////////////////////////////////
 // StringOption
 /////////////////////////////////////////////////////////////////
@@ -414,7 +394,6 @@ void IntOption::SetVal(OptionsDict* dict, const ValueType& val) const {
   dict->Set<ValueType>(GetId(), val);
 }
 
-#ifndef NO_CHARCONV
 int IntOption::ValidateIntString(const std::string& val) const {
   int result;
   const auto end = val.data() + val.size();
@@ -429,20 +408,6 @@ int IntOption::ValidateIntString(const std::string& val) const {
     return result;
   }
 }
-#else
-int IntOption::ValidateIntString(const std::string& val) const {
-  char* end;
-  errno = 0;
-  int result = std::strtol(val.c_str(), &end, 10);
-  if (errno == ERANGE) {
-    throw Exception("Flag '--" + GetLongFlag() + "' is out of range.");
-  } else if (val.length() == 0 || *end != '\0') {
-    throw Exception("Flag '--" + GetLongFlag() + "' value is invalid.");
-  } else {
-    return result;
-  }
-}
-#endif
 
 /////////////////////////////////////////////////////////////////
 // FloatOption
diff --git a/src/utils/optionsparser.h b/src/utils/optionsparser.h
index e689d21e59..0387c09862 100644
--- a/src/utils/optionsparser.h
+++ b/src/utils/optionsparser.h
@@ -44,15 +44,19 @@ class OptionsParser {
   class Option {
    public:
     Option(const OptionId& id);
-    virtual ~Option(){};
+    virtual ~Option() {};
     // Set value from string.
     virtual void SetValue(const std::string& value, OptionsDict* dict) = 0;
 
    protected:
     const OptionId& GetId() const { return id_; }
-    std::string GetUciOption() const { return id_.uci_option(); }
+    std::string GetUciOption() const {
+      return id_.uci_option() ? id_.uci_option() : "";
+    }
     std::string GetHelpText() const { return id_.help_text(); }
-    std::string GetLongFlag() const { return id_.long_flag(); }
+    std::string GetLongFlag() const {
+      return id_.long_flag() ? id_.long_flag() : "";
+    }
     char GetShortFlag() const { return id_.short_flag(); }
 
    private:
@@ -73,7 +77,6 @@ class OptionsParser {
     virtual std::string GetHelp(const OptionsDict& dict) const = 0;
 
     const OptionId& id_;
-    bool hidden_ = false;
     friend class OptionsParser;
   };
 
@@ -95,12 +98,6 @@ class OptionsParser {
   // Set the UCI option from string value.
   void SetUciOption(const std::string& name, const std::string& value,
                     const std::string& context = "");
-  // Hide this option from help and UCI.
-  void HideOption(const OptionId& id);
-  // Hide all options defined so far from help and UCI.
-  void HideAllOptions();
-  // Make this option visible from help and UCI.
-  void UnhideOption(const OptionId& id);
   // Processes all flags from the command line and an optional
   // configuration file. Returns false if there is an invalid flag.
   bool ProcessAllFlags();
@@ -119,8 +116,6 @@ class OptionsParser {
   void ShowHelp() const;
 
  private:
-  // Make all hidden options visible.
-  void ShowHidden() const;
   // Returns an option based on the long flag.
   Option* FindOptionByLongFlag(const std::string& flag) const;
   // Returns an option based by its uci name.
@@ -131,6 +126,7 @@ class OptionsParser {
   std::vector<std::unique_ptr<Option>> options_;
   OptionsDict defaults_;
   OptionsDict& values_;
+  OptionId::VisibilityMode visibility_mode_ = OptionId::kNormalMode;
 };
 
 class StringOption : public OptionsParser::Option {
diff --git a/src/utils/spinhelper.h b/src/utils/spinhelper.h
index c5e8f53633..7c2d7a8757 100644
--- a/src/utils/spinhelper.h
+++ b/src/utils/spinhelper.h
@@ -44,20 +44,17 @@ class SpinHelper {
 class ExponentialBackoffSpinHelper : public SpinHelper {
  public:
   ExponentialBackoffSpinHelper()
-    : backoff_iters_(kMinBackoffIters),
-      spin_to_sleep_iters_(0) {
-  }
+      : backoff_iters_(kMinBackoffIters), spin_to_sleep_iters_(0) {}
 
   virtual void Backoff() {
     thread_local std::uniform_int_distribution<size_t> distribution;
     thread_local std::minstd_rand generator(std::random_device{}());
-    const size_t spin_count = distribution(generator, decltype(distribution)::param_type{0, backoff_iters_});
+    const size_t spin_count = distribution(
+        generator, decltype(distribution)::param_type{0, backoff_iters_});
 
-    for (volatile size_t i=0; i<spin_count; i++) {
-      SpinloopPause();
-    }
+    for (size_t i = 0; i < spin_count; i++) SpinloopPause();
 
-    backoff_iters_ = std::min(2*backoff_iters_, kMaxBackoffIters);
+    backoff_iters_ = std::min(2 * backoff_iters_, kMaxBackoffIters);
     spin_to_sleep_iters_ = 0;
   }
 
diff --git a/src/utils/trace.cc b/src/utils/trace.cc
new file mode 100644
index 0000000000..373cd0fb78
--- /dev/null
+++ b/src/utils/trace.cc
@@ -0,0 +1,30 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "trace.h"
+
+LCTRACE_DECLARE_CATEGORIES;
diff --git a/src/utils/trace.h b/src/utils/trace.h
new file mode 100644
index 0000000000..218aade57d
--- /dev/null
+++ b/src/utils/trace.h
@@ -0,0 +1,72 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2025 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <http://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "trace_config.h"
+
+#if USE_PERFETTO_TRACE
+#include <perfetto.h>
+
+PERFETTO_DEFINE_CATEGORIES(
+    perfetto::Category("lc0").SetDescription("Leela Chess Zero"));
+#endif
+
+#if USE_NVTX_TRACE
+#include <nvtx3/nvtx3.hpp>
+#endif
+
+namespace lczero {
+#if USE_PERFETTO_TRACE
+#define LCTRACE_DECLARE_CATEGORIES PERFETTO_TRACK_EVENT_STATIC_STORAGE();
+
+#define LCTRACE_INITIALIZE                     \
+  do {                                         \
+    perfetto::TracingInitArgs args;            \
+    args.backends |= perfetto::kSystemBackend; \
+    perfetto::Tracing::Initialize(args);       \
+    perfetto::TrackEvent::Register();          \
+  } while (false)
+
+#define LCTRACE_FUNCTION_SCOPE /* scoped event named after __func__ */ \
+  const auto& lctrace_function_name_ = __func__; /* avoids clashes */   \
+  TRACE_EVENT("lc0", lctrace_function_name_)
+
+#elif USE_NVTX_TRACE
+#define LCTRACE_DECLARE_CATEGORIES /* nop */
+#define LCTRACE_INITIALIZE         /* nop */
+struct lc0_domain {  // NVTX domain tag type.
+  static constexpr char name[] = "lc0";
+};
+#define LCTRACE_FUNCTION_SCOPE NVTX3_FUNC_RANGE_IN(::lczero::lc0_domain)
+#else
+
+#define LCTRACE_DECLARE_CATEGORIES
+#define LCTRACE_INITIALIZE
+#define LCTRACE_FUNCTION_SCOPE
+#endif
+}  // namespace lczero
diff --git a/src/utils/weights_adapter.cc b/src/utils/weights_adapter.cc
index 2a5d196f14..a0f7adb278 100644
--- a/src/utils/weights_adapter.cc
+++ b/src/utils/weights_adapter.cc
@@ -25,19 +25,63 @@
   Program grant you additional permission to convey the resulting work.
 */
 
-#include "src/utils/weights_adapter.h"
+#include "utils/weights_adapter.h"
+
+#include <absl/base/optimization.h>
+
+#include "utils/bf16_utils.h"
+#include "utils/exception.h"
+#include "utils/fp16_utils.h"
 
 namespace lczero {
-float LayerAdapter::Iterator::ExtractValue(const uint16_t* ptr,
+
+float LayerAdapter::Iterator::ExtractValue(const std::byte* ptr,
                                            const LayerAdapter* adapter) {
-  return *ptr / static_cast<float>(0xffff) * adapter->range_ + adapter->min_;
+  switch (adapter->encoding_) {
+    case pblczero::Weights::Layer::LINEAR16: {
+      float theta =
+          *reinterpret_cast<const uint16_t*>(ptr) / static_cast<float>(0xffff);
+      return adapter->min_ * (1 - theta) + adapter->max_ * theta;
+    }
+    case pblczero::Weights::Layer::FLOAT16:
+      return FP16toFP32(*reinterpret_cast<const uint16_t*>(ptr));
+    case pblczero::Weights::Layer::BFLOAT16:
+      return BF16toFP32(*reinterpret_cast<const uint16_t*>(ptr));
+    case pblczero::Weights::Layer::FLOAT32: {
+      return *reinterpret_cast<const float*>(ptr);
+    }
+    [[unlikely]] default:  // To silence a couple of warnings.
+#if defined(ABSL_UNREACHABLE)
+      ABSL_UNREACHABLE();
+#elif defined(__GNUC__)
+      __builtin_unreachable();
+#else
+      __assume(false);
+#endif
+  }
 }
 
 LayerAdapter::LayerAdapter(const pblczero::Weights::Layer& layer)
-    : data_(reinterpret_cast<const uint16_t*>(layer.params().data())),
-      size_(layer.params().size() / sizeof(uint16_t)),
+    : encoding_(layer.has_encoding() ? layer.encoding()
+                                     : pblczero::Weights::Layer::LINEAR16),
+      element_size_(encoding_ == pblczero::Weights::Layer::FLOAT32
+                        ? sizeof(float)
+                        : sizeof(uint16_t)),
+      data_(reinterpret_cast<const std::byte*>(layer.params().data())),
+      size_(layer.params().size() / element_size_),
       min_(layer.min_val()),
-      range_(layer.max_val() - min_) {}
+      max_(layer.max_val()) {
+  switch (encoding_) {
+    case pblczero::Weights::Layer::LINEAR16:
+    case pblczero::Weights::Layer::FLOAT16:
+    case pblczero::Weights::Layer::BFLOAT16:
+    case pblczero::Weights::Layer::FLOAT32:
+      break;
+    default:
+      throw Exception("Unknown layer encoding " +
+                      pblczero::Weights::Layer::Encoding_Name(encoding_));
+  }
+}
 
 std::vector<float> LayerAdapter::as_vector() const {
   return std::vector<float>(begin(), end());
@@ -46,7 +90,7 @@ float LayerAdapter::Iterator::operator*() const {
   return ExtractValue(data_, adapter_);
 }
 float LayerAdapter::Iterator::operator[](size_t idx) const {
-  return ExtractValue(data_ + idx, adapter_);
+  return ExtractValue(data_ + idx * adapter_->element_size_, adapter_);
 }
 
 }  // namespace lczero
diff --git a/src/utils/weights_adapter.h b/src/utils/weights_adapter.h
index b26b172dfa..ded90989da 100644
--- a/src/utils/weights_adapter.h
+++ b/src/utils/weights_adapter.h
@@ -27,6 +27,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <iterator>
 #include <vector>
 
@@ -56,27 +57,28 @@ class LayerAdapter {
       return data_ != other.data_;
     }
     Iterator& operator++() {
-      ++data_;
+      data_ += adapter_->element_size_;
       return *this;
     }
     Iterator& operator--() {
-      --data_;
+      data_ -= adapter_->element_size_;
       return *this;
     }
     ptrdiff_t operator-(const Iterator& other) const {
-      return data_ - other.data_;
+      return (data_ - other.data_) / adapter_->element_size_;
     }
 
     // TODO(crem) implement other iterator functions when they are needed.
 
    private:
     friend class LayerAdapter;
-    Iterator(const LayerAdapter* adapter, const uint16_t* ptr)
+    Iterator(const LayerAdapter* adapter, const std::byte* ptr)
         : adapter_(adapter), data_(ptr) {}
-    static float ExtractValue(const uint16_t* ptr, const LayerAdapter* adapter);
+    static float ExtractValue(const std::byte* ptr,
+                              const LayerAdapter* adapter);
 
     const LayerAdapter* adapter_ = nullptr;
-    const uint16_t* data_ = nullptr;
+    const std::byte* data_ = nullptr;
   };
 
   LayerAdapter(const pblczero::Weights::Layer& layer);
@@ -84,13 +86,15 @@ class LayerAdapter {
   size_t size() const { return size_; }
   float operator[](size_t idx) const { return begin()[idx]; }
   Iterator begin() const { return {this, data_}; }
-  Iterator end() const { return {this, data_ + size_}; }
+  Iterator end() const { return {this, data_ + size_ * element_size_}; }
 
  private:
-  const uint16_t* data_ = nullptr;
+  const pblczero::Weights::Layer::Encoding encoding_;
+  const size_t element_size_ = 0;
+  const std::byte* data_ = nullptr;
   const size_t size_ = 0;
   const float min_;
-  const float range_;
+  const float max_;
 };
 
 }  // namespace lczero
diff --git a/src/version.inc b/src/version.inc
index a653627335..ccb71f6d74 100644
--- a/src/version.inc
+++ b/src/version.inc
@@ -1,4 +1,4 @@
 #define LC0_VERSION_MAJOR 0
-#define LC0_VERSION_MINOR 32
+#define LC0_VERSION_MINOR 33
 #define LC0_VERSION_PATCH 0
 #define LC0_VERSION_POSTFIX "dev"
diff --git a/subprojects/abseil-cpp.wrap b/subprojects/abseil-cpp.wrap
new file mode 100644
index 0000000000..18b5a1dda4
--- /dev/null
+++ b/subprojects/abseil-cpp.wrap
@@ -0,0 +1,105 @@
+[wrap-file]
+directory = abseil-cpp-20240722.0
+source_url = https://github.com/abseil/abseil-cpp/releases/download/20240722.0/abseil-cpp-20240722.0.tar.gz
+source_filename = abseil-cpp-20240722.0.tar.gz
+source_hash = f50e5ac311a81382da7fa75b97310e4b9006474f9560ac46f54a9967f07d4ae3
+patch_directory = abseil-cpp-20240722.0
+source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/abseil-cpp_20240722.0-3/abseil-cpp-20240722.0.tar.gz
+
+[provide]
+absl_base = absl_base_dep
+absl_container = absl_container_dep
+absl_debugging = absl_debugging_dep
+absl_log = absl_log_dep
+absl_flags = absl_flags_dep
+absl_hash = absl_hash_dep
+absl_crc = absl_crc_dep
+absl_numeric = absl_numeric_dep
+absl_profiling = absl_profiling_dep
+absl_random = absl_random_dep
+absl_status = absl_status_dep
+absl_strings = absl_strings_dep
+absl_synchronization = absl_synchronization_dep
+absl_time = absl_time_dep
+absl_types = absl_types_dep
+absl_algorithm_container = absl_base_dep
+absl_any_invocable = absl_base_dep
+absl_bad_any_cast_impl = absl_types_dep
+absl_bad_optional_access = absl_types_dep
+absl_bad_variant_access = absl_types_dep
+absl_bind_front = absl_base_dep
+absl_city = absl_hash_dep
+absl_civil_time = absl_time_dep
+absl_cleanup = absl_base_dep
+absl_cord = absl_strings_dep
+absl_cord_internal = absl_strings_dep
+absl_cordz_functions = absl_strings_dep
+absl_cordz_handle = absl_strings_dep
+absl_cordz_info = absl_strings_dep
+absl_cordz_sample_token = absl_strings_dep
+absl_core_headers = absl_base_dep
+absl_crc32c = absl_crc_dep
+absl_debugging_internal = absl_debugging_dep
+absl_demangle_internal = absl_debugging_dep
+absl_die_if_null = absl_log_dep
+absl_examine_stack = absl_debugging_dep
+absl_exponential_biased = absl_profiling_dep
+absl_failure_signal_handler = absl_debugging_dep
+absl_flags_commandlineflag = absl_flags_dep
+absl_flags_commandlineflag_internal = absl_flags_dep
+absl_flags_config = absl_flags_dep
+absl_flags_internal = absl_flags_dep
+absl_flags_marshalling = absl_flags_dep
+absl_flags_parse = absl_flags_dep
+absl_flags_private_handle_accessor = absl_flags_dep
+absl_flags_program_name = absl_flags_dep
+absl_flags_reflection = absl_flags_dep
+absl_flags_usage = absl_flags_dep
+absl_flags_usage_internal = absl_flags_dep
+absl_flat_hash_map = absl_container_dep
+absl_flat_hash_set = absl_container_dep
+absl_function_ref = absl_base_dep
+absl_graphcycles_internal = absl_synchronization_dep
+absl_hashtablez_sampler = absl_container_dep
+absl_inlined_vector = absl_container_dep
+absl_int128 = absl_numeric_dep
+absl_leak_check = absl_debugging_dep
+absl_log_initialize = absl_log_dep
+absl_log_internal_check_op = absl_log_dep
+absl_log_internal_message = absl_log_dep
+absl_log_severity = absl_base_dep
+absl_low_level_hash = absl_hash_dep
+absl_memory = absl_base_dep
+absl_optional = absl_types_dep
+absl_periodic_sampler = absl_profiling_dep
+absl_random_bit_gen_ref = absl_random_dep
+absl_random_distributions = absl_random_dep
+absl_random_internal_distribution_test_util = absl_random_dep
+absl_random_internal_platform = absl_random_dep
+absl_random_internal_pool_urbg = absl_random_dep
+absl_random_internal_randen = absl_random_dep
+absl_random_internal_randen_hwaes = absl_random_dep
+absl_random_internal_randen_hwaes_impl = absl_random_dep
+absl_random_internal_randen_slow = absl_random_dep
+absl_random_internal_seed_material = absl_random_dep
+absl_random_random = absl_random_dep
+absl_random_seed_gen_exception = absl_random_dep
+absl_random_seed_sequences = absl_random_dep
+absl_raw_hash_set = absl_container_dep
+absl_raw_logging_internal = absl_base_dep
+absl_scoped_set_env = absl_base_dep
+absl_span = absl_types_dep
+absl_spinlock_wait = absl_base_dep
+absl_stacktrace = absl_debugging_dep
+absl_statusor = absl_status_dep
+absl_str_format = absl_strings_dep
+absl_str_format_internal = absl_strings_dep
+absl_strerror = absl_base_dep
+absl_string_view = absl_strings_dep
+absl_strings_internal = absl_strings_dep
+absl_symbolize = absl_debugging_dep
+absl_throw_delegate = absl_base_dep
+absl_time_zone = absl_time_dep
+absl_type_traits = absl_base_dep
+absl_utility = absl_base_dep
+absl_variant = absl_types_dep
diff --git a/subprojects/cutlass.wrap b/subprojects/cutlass.wrap
new file mode 100644
index 0000000000..9b2e897962
--- /dev/null
+++ b/subprojects/cutlass.wrap
@@ -0,0 +1,7 @@
+[wrap-git]
+url = https://github.com/NVIDIA/cutlass.git
+revision = v2.11.0
+
+patch_directory = cutlass
+
+
diff --git a/subprojects/eigen.wrap b/subprojects/eigen.wrap
index e46839c90b..becc4767c7 100644
--- a/subprojects/eigen.wrap
+++ b/subprojects/eigen.wrap
@@ -3,10 +3,11 @@ directory = eigen-3.4.0
 source_url = https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2
 source_filename = eigen-3.4.0.tar.bz2
 source_hash = b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626
-patch_filename = eigen_3.4.0-1_patch.zip
-patch_url = https://wrapdb.mesonbuild.com/v2/eigen_3.4.0-1/get_patch
-patch_hash = fae999acdb3ea23eada3becdbde7f7f76755e94ad85fee7775b7ab1cf12e84e3
+patch_filename = eigen_3.4.0-2_patch.zip
+patch_url = https://wrapdb.mesonbuild.com/v2/eigen_3.4.0-2/get_patch
+patch_hash = cb764fd9fec02d94aaa2ec673d473793c0d05da4f4154c142f76ef923ea68178
+source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/eigen_3.4.0-2/eigen-3.4.0.tar.bz2
+wrapdb_version = 3.4.0-2
 
 [provide]
 eigen3 = eigen_dep
-
diff --git a/subprojects/gtest.wrap b/subprojects/gtest.wrap
index ca5d699ec0..ba73d9662e 100644
--- a/subprojects/gtest.wrap
+++ b/subprojects/gtest.wrap
@@ -1,10 +1,16 @@
 [wrap-file]
-directory = googletest-release-1.10.0
+directory = googletest-1.15.2
+source_url = https://github.com/google/googletest/archive/refs/tags/v1.15.2.tar.gz
+source_filename = gtest-1.15.2.tar.gz
+source_hash = 7b42b4d6ed48810c5362c265a17faebe90dc2373c885e5216439d37927f02926
+patch_filename = gtest_1.15.2-4_patch.zip
+patch_url = https://wrapdb.mesonbuild.com/v2/gtest_1.15.2-4/get_patch
+patch_hash = a5151324b97e6a98fa7a0e8095523e6d5c4bb3431210d6ac4ad9800c345acf40
+source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/gtest_1.15.2-4/gtest-1.15.2.tar.gz
+wrapdb_version = 1.15.2-4
 
-source_url = https://github.com/google/googletest/archive/release-1.10.0.zip
-source_filename = gtest-1.10.0.zip
-source_hash = 94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91
-
-patch_url = https://wrapdb.mesonbuild.com/v1/projects/gtest/1.10.0/1/get_zip
-patch_filename = gtest-1.10.0-1-wrap.zip
-patch_hash = 04ff14e8880e4e465f6260221e9dfd56fea6bc7cce4c4aff0dc528e4a2c8f514
+[provide]
+gtest = gtest_dep
+gtest_main = gtest_main_dep
+gmock = gmock_dep
+gmock_main = gmock_main_dep
diff --git a/subprojects/packagefiles/abseil-cpp-20240722.0/LICENSE.build b/subprojects/packagefiles/abseil-cpp-20240722.0/LICENSE.build
new file mode 100644
index 0000000000..b59833dedb
--- /dev/null
+++ b/subprojects/packagefiles/abseil-cpp-20240722.0/LICENSE.build
@@ -0,0 +1,19 @@
+Copyright (c) 2021 The Meson development team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/subprojects/packagefiles/abseil-cpp-20240722.0/absl/base/internal/per_thread_tls.h b/subprojects/packagefiles/abseil-cpp-20240722.0/absl/base/internal/per_thread_tls.h
new file mode 100644
index 0000000000..441a8cb0f0
--- /dev/null
+++ b/subprojects/packagefiles/abseil-cpp-20240722.0/absl/base/internal/per_thread_tls.h
@@ -0,0 +1,52 @@
+// Copyright 2017 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ABSL_BASE_INTERNAL_PER_THREAD_TLS_H_
+#define ABSL_BASE_INTERNAL_PER_THREAD_TLS_H_
+
+// This header defines two macros:
+//
+// If the platform supports thread-local storage:
+//
+// * ABSL_PER_THREAD_TLS_KEYWORD is the C keyword needed to declare a
+//   thread-local variable
+// * ABSL_PER_THREAD_TLS is 1
+//
+// Otherwise:
+//
+// * ABSL_PER_THREAD_TLS_KEYWORD is empty
+// * ABSL_PER_THREAD_TLS is 0
+//
+// Microsoft C supports thread-local storage.
+// GCC supports it if the appropriate version of glibc is available,
+// which the programmer can indicate by defining ABSL_HAVE_TLS
+
+#include "absl/base/port.h"  // For ABSL_HAVE_TLS
+
+#if defined(ABSL_PER_THREAD_TLS)
+#error ABSL_PER_THREAD_TLS cannot be directly set
+#elif defined(ABSL_PER_THREAD_TLS_KEYWORD)
+#error ABSL_PER_THREAD_TLS_KEYWORD cannot be directly set
+#elif defined(ABSL_HAVE_TLS) || defined(__INTEL_LLVM_COMPILER)
+#define ABSL_PER_THREAD_TLS_KEYWORD __thread
+#define ABSL_PER_THREAD_TLS 1
+#elif defined(_MSC_VER)
+#define ABSL_PER_THREAD_TLS_KEYWORD __declspec(thread)
+#define ABSL_PER_THREAD_TLS 1
+#else
+#define ABSL_PER_THREAD_TLS_KEYWORD
+#define ABSL_PER_THREAD_TLS 0
+#endif
+
+#endif  // ABSL_BASE_INTERNAL_PER_THREAD_TLS_H_
diff --git a/subprojects/packagefiles/abseil-cpp-20240722.0/meson.build b/subprojects/packagefiles/abseil-cpp-20240722.0/meson.build
new file mode 100644
index 0000000000..9848e5b33b
--- /dev/null
+++ b/subprojects/packagefiles/abseil-cpp-20240722.0/meson.build
@@ -0,0 +1,906 @@
+project(
+  'abseil-cpp',
+  'cpp',
+  version: '20240722.0',
+  license: 'Apache-2.0',
+  default_options: [
+    'cpp_std=c++17',
+  ],
+)
+
+cpp = meson.get_compiler('cpp')
+
+flags = cpp.get_supported_arguments('/DNOMINMAX', '-Wno-sign-compare', '-Wno-gcc-compat')
+add_project_arguments(flags, language: 'cpp')
+
+arch_cpp_flags = []
+hw_cpp_flags = []
+if host_machine.cpu_family() == 'x86_64'
+  hw_cpp_flags += ['-maes', '-msse4.1']
+elif host_machine.cpu_family() == 'aarch64' and cpp.sizeof('void*') == 8
+  hw_cpp_flags += ['-march=armv8-a+crypto']
+elif host_machine.cpu_family() == 'arm' and cpp.sizeof('void*') == 4
+  hw_cpp_flags += ['-mfpu=neon']
+elif host_machine.cpu_family() == 'ppc' or host_machine.cpu_family() == 'ppc64'
+  # This will work with glibc but not musl
+  timebase_check = '''#include <sys/platform/ppc.h>
+    int main() {
+       __ppc_get_timebase_freq();
+       return 0;
+    }'''
+  if not cpp.compiles(timebase_check)
+    arch_cpp_flags += ['-DABSL_USE_UNSCALED_CYCLECLOCK=0']
+  endif
+endif
+arch_flags = cpp.get_supported_arguments(arch_cpp_flags)
+hw_flags = cpp.get_supported_arguments(hw_cpp_flags)
+
+libatomic = dependency('', required: false)
+if cpp.get_argument_syntax() != 'msvc' and not cpp.links('int main(){__sync_synchronize();}', name: 'atomic builtins')
+  libatomic = cpp.find_library('atomic')
+endif
+
+absl_include_dir = include_directories('.')
+
+# Group files by the containing library
+absl_base_sources = files(
+  'absl/base/internal/cycleclock.cc',
+  'absl/base/internal/low_level_alloc.cc',
+  'absl/base/internal/poison.cc',
+  'absl/base/internal/raw_logging.cc',
+  'absl/base/internal/scoped_set_env.cc',
+  'absl/base/internal/spinlock.cc',
+  'absl/base/internal/spinlock_wait.cc',
+  'absl/base/internal/strerror.cc',
+  'absl/base/internal/sysinfo.cc',
+  'absl/base/internal/thread_identity.cc',
+  'absl/base/internal/throw_delegate.cc',
+  'absl/base/internal/unscaledcycleclock.cc',
+  'absl/base/log_severity.cc',
+)
+absl_base_headers = files(
+  'absl/base/attributes.h',
+  'absl/base/call_once.h',
+  'absl/base/casts.h',
+  'absl/base/config.h',
+  'absl/base/const_init.h',
+  'absl/base/dynamic_annotations.h',
+  'absl/base/internal/atomic_hook.h',
+  'absl/base/internal/atomic_hook_test_helper.h',
+  'absl/base/internal/cycleclock.h',
+  'absl/base/internal/cycleclock_config.h',
+  'absl/base/internal/direct_mmap.h',
+  'absl/base/internal/dynamic_annotations.h',
+  'absl/base/internal/endian.h',
+  'absl/base/internal/errno_saver.h',
+  'absl/base/internal/exception_safety_testing.h',
+  'absl/base/internal/exception_testing.h',
+  'absl/base/internal/fast_type_id.h',
+  'absl/base/internal/hide_ptr.h',
+  'absl/base/internal/identity.h',
+  'absl/base/internal/inline_variable.h',
+  'absl/base/internal/inline_variable_testing.h',
+  'absl/base/internal/invoke.h',
+  'absl/base/internal/low_level_alloc.h',
+  'absl/base/internal/low_level_scheduling.h',
+  'absl/base/internal/nullability_impl.h',
+  'absl/base/internal/per_thread_tls.h',
+  'absl/base/internal/poison.h',
+  'absl/base/internal/pretty_function.h',
+  'absl/base/internal/raw_logging.h',
+  'absl/base/internal/scheduling_mode.h',
+  'absl/base/internal/scoped_set_env.h',
+  'absl/base/internal/spinlock.h',
+  'absl/base/internal/spinlock_akaros.inc',
+  'absl/base/internal/spinlock_linux.inc',
+  'absl/base/internal/spinlock_posix.inc',
+  'absl/base/internal/spinlock_wait.h',
+  'absl/base/internal/spinlock_win32.inc',
+  'absl/base/internal/strerror.h',
+  'absl/base/internal/sysinfo.h',
+  'absl/base/internal/thread_identity.h',
+  'absl/base/internal/throw_delegate.h',
+  'absl/base/internal/tsan_mutex_interface.h',
+  'absl/base/internal/unaligned_access.h',
+  'absl/base/internal/unscaledcycleclock.h',
+  'absl/base/internal/unscaledcycleclock_config.h',
+  'absl/base/log_severity.h',
+  'absl/base/macros.h',
+  'absl/base/no_destructor.h',
+  'absl/base/nullability.h',
+  'absl/base/optimization.h',
+  'absl/base/options.h',
+  'absl/base/policy_checks.h',
+  'absl/base/port.h',
+  'absl/base/thread_annotations.h',
+  # Dependent headers of absl_base (located outside absl/base/)
+  'absl/functional/any_invocable.h',
+  'absl/functional/internal/any_invocable.h',
+  'absl/memory/memory.h',
+  'absl/meta/type_traits.h',
+  'absl/utility/utility.h',
+)
+
+absl_container_sources = files(
+  'absl/container/internal/hashtablez_sampler.cc',
+  'absl/container/internal/hashtablez_sampler_force_weak_definition.cc',
+  'absl/container/internal/raw_hash_set.cc',
+)
+absl_container_headers = files(
+  'absl/container/btree_map.h',
+  'absl/container/btree_set.h',
+  'absl/container/hash_container_defaults.h',
+  'absl/container/btree_test.h',
+  'absl/container/fixed_array.h',
+  'absl/container/flat_hash_map.h',
+  'absl/container/flat_hash_set.h',
+  'absl/container/inlined_vector.h',
+  'absl/container/internal/btree.h',
+  'absl/container/internal/btree_container.h',
+  'absl/container/internal/common.h',
+  'absl/container/internal/common_policy_traits.h',
+  'absl/container/internal/compressed_tuple.h',
+  'absl/container/internal/container_memory.h',
+  'absl/container/internal/hash_function_defaults.h',
+  'absl/container/internal/hash_generator_testing.h',
+  'absl/container/internal/hash_policy_testing.h',
+  'absl/container/internal/hash_policy_traits.h',
+  'absl/container/internal/hashtable_debug.h',
+  'absl/container/internal/hashtable_debug_hooks.h',
+  'absl/container/internal/hashtablez_sampler.h',
+  'absl/container/internal/inlined_vector.h',
+  'absl/container/internal/layout.h',
+  'absl/container/internal/node_slot_policy.h',
+  'absl/container/internal/raw_hash_map.h',
+  'absl/container/internal/raw_hash_set.h',
+  'absl/container/internal/test_instance_tracker.h',
+  'absl/container/internal/tracked.h',
+  'absl/container/internal/unordered_map_constructor_test.h',
+  'absl/container/internal/unordered_map_lookup_test.h',
+  'absl/container/internal/unordered_map_members_test.h',
+  'absl/container/internal/unordered_map_modifiers_test.h',
+  'absl/container/internal/unordered_set_constructor_test.h',
+  'absl/container/internal/unordered_set_lookup_test.h',
+  'absl/container/internal/unordered_set_members_test.h',
+  'absl/container/internal/unordered_set_modifiers_test.h',
+  'absl/container/node_hash_map.h',
+  'absl/container/node_hash_set.h',
+)
+
+absl_crc_sources = files(
+  'absl/crc/crc32c.cc',
+  'absl/crc/internal/cpu_detect.cc',
+  'absl/crc/internal/crc.cc',
+  'absl/crc/internal/crc_cord_state.cc',
+  'absl/crc/internal/crc_memcpy_fallback.cc',
+  'absl/crc/internal/crc_memcpy_x86_arm_combined.cc',
+  'absl/crc/internal/crc_non_temporal_memcpy.cc',
+  'absl/crc/internal/crc_x86_arm_combined.cc',
+)
+absl_crc_headers = files(
+  'absl/crc/crc32c.h',
+  'absl/crc/internal/cpu_detect.h',
+  'absl/crc/internal/crc.h',
+  'absl/crc/internal/crc32_x86_arm_combined_simd.h',
+  'absl/crc/internal/crc32c.h',
+  'absl/crc/internal/crc32c_inline.h',
+  'absl/crc/internal/crc_cord_state.h',
+  'absl/crc/internal/crc_internal.h',
+  'absl/crc/internal/crc_memcpy.h',
+  'absl/crc/internal/non_temporal_arm_intrinsics.h',
+  'absl/crc/internal/non_temporal_memcpy.h',
+)
+
+absl_debugging_sources = files(
+  'absl/debugging/failure_signal_handler.cc',
+  'absl/debugging/internal/address_is_readable.cc',
+  'absl/debugging/internal/decode_rust_punycode.cc',
+  'absl/debugging/internal/demangle.cc',
+  'absl/debugging/internal/demangle_rust.cc',
+  'absl/debugging/internal/elf_mem_image.cc',
+  'absl/debugging/internal/examine_stack.cc',
+  'absl/debugging/internal/stack_consumption.cc',
+  'absl/debugging/internal/vdso_support.cc',
+  'absl/debugging/leak_check.cc',
+  'absl/debugging/stacktrace.cc',
+  'absl/debugging/symbolize.cc',
+  'absl/debugging/internal/utf8_for_code_point.cc',
+)
+absl_debugging_headers = files(
+  'absl/debugging/failure_signal_handler.h',
+  'absl/debugging/internal/address_is_readable.h',
+  'absl/debugging/internal/bounded_utf8_length_sequence.h',
+  'absl/debugging/internal/decode_rust_punycode.h',
+  'absl/debugging/internal/demangle.h',
+  'absl/debugging/internal/demangle_rust.h',
+  'absl/debugging/internal/elf_mem_image.h',
+  'absl/debugging/internal/examine_stack.h',
+  'absl/debugging/internal/stack_consumption.h',
+  'absl/debugging/internal/stacktrace_aarch64-inl.inc',
+  'absl/debugging/internal/stacktrace_arm-inl.inc',
+  'absl/debugging/internal/stacktrace_config.h',
+  'absl/debugging/internal/stacktrace_emscripten-inl.inc',
+  'absl/debugging/internal/stacktrace_generic-inl.inc',
+  'absl/debugging/internal/stacktrace_powerpc-inl.inc',
+  'absl/debugging/internal/stacktrace_riscv-inl.inc',
+  'absl/debugging/internal/stacktrace_unimplemented-inl.inc',
+  'absl/debugging/internal/stacktrace_win32-inl.inc',
+  'absl/debugging/internal/stacktrace_x86-inl.inc',
+  'absl/debugging/internal/symbolize.h',
+  'absl/debugging/internal/utf8_for_code_point.h',
+  'absl/debugging/internal/vdso_support.h',
+  'absl/debugging/leak_check.h',
+  'absl/debugging/stacktrace.h',
+  'absl/debugging/symbolize.h',
+  'absl/debugging/symbolize_darwin.inc',
+  'absl/debugging/symbolize_elf.inc',
+  'absl/debugging/symbolize_emscripten.inc',
+  'absl/debugging/symbolize_unimplemented.inc',
+  'absl/debugging/symbolize_win32.inc',
+)
+
+# Translation units compiled into the absl_flags static library.
+absl_flags_sources = files(
+  'absl/flags/commandlineflag.cc',
+  'absl/flags/internal/commandlineflag.cc',
+  'absl/flags/internal/flag.cc',
+  'absl/flags/internal/private_handle_accessor.cc',
+  'absl/flags/internal/program_name.cc',
+  'absl/flags/internal/usage.cc',
+  'absl/flags/marshalling.cc',
+  'absl/flags/parse.cc',
+  'absl/flags/reflection.cc',
+  'absl/flags/usage.cc',
+  'absl/flags/usage_config.cc',
+)
+# Headers of absl_flags. NOTE(review): the public 'absl/flags/flag.h' is not listed -- confirm whether it should be.
+absl_flags_headers = files(
+  'absl/flags/commandlineflag.h',
+  'absl/flags/config.h',
+  'absl/flags/declare.h',
+  'absl/flags/internal/commandlineflag.h',
+  'absl/flags/internal/flag.h',
+  'absl/flags/internal/parse.h',
+  'absl/flags/internal/path_util.h',
+  'absl/flags/internal/private_handle_accessor.h',
+  'absl/flags/internal/program_name.h',
+  'absl/flags/internal/registry.h',
+  'absl/flags/internal/sequence_lock.h',
+  'absl/flags/internal/usage.h',
+  'absl/flags/marshalling.h',
+  'absl/flags/parse.h',
+  'absl/flags/reflection.h',
+  'absl/flags/usage.h',
+  'absl/flags/usage_config.h',
+)
+
+absl_hash_sources = files(
+  'absl/hash/internal/city.cc',
+  'absl/hash/internal/hash.cc',
+  'absl/hash/internal/low_level_hash.cc',
+)
+absl_hash_headers = files(
+  'absl/hash/hash.h',
+  'absl/hash/hash_testing.h',
+  'absl/hash/internal/city.h',
+  'absl/hash/internal/hash.h',
+  'absl/hash/internal/low_level_hash.h',
+  'absl/hash/internal/spy_hash_state.h',
+)
+
+absl_log_sources = files(
+  'absl/log/die_if_null.cc',
+  'absl/log/flags.cc',
+  'absl/log/globals.cc',
+  'absl/log/initialize.cc',
+  'absl/log/internal/check_op.cc',
+  'absl/log/internal/conditions.cc',
+  'absl/log/internal/fnmatch.cc',
+  'absl/log/internal/globals.cc',
+  'absl/log/internal/log_format.cc',
+  'absl/log/internal/log_message.cc',
+  'absl/log/internal/log_sink_set.cc',
+  'absl/log/internal/nullguard.cc',
+  'absl/log/internal/proto.cc',
+  'absl/log/internal/vlog_config.cc',
+  'absl/log/log_entry.cc',
+  'absl/log/log_sink.cc',
+)
+absl_log_headers = files(
+  'absl/log/absl_check.h',
+  'absl/log/absl_log.h',
+  'absl/log/check.h',
+  'absl/log/die_if_null.h',
+  'absl/log/flags.h',
+  'absl/log/globals.h',
+  'absl/log/initialize.h',
+  'absl/log/internal/append_truncated.h',
+  'absl/log/internal/check_impl.h',
+  'absl/log/internal/check_op.h',
+  'absl/log/internal/conditions.h',
+  'absl/log/internal/config.h',
+  'absl/log/internal/fnmatch.h',
+  'absl/log/internal/flags.h',
+  'absl/log/internal/globals.h',
+  'absl/log/internal/log_format.h',
+  'absl/log/internal/log_impl.h',
+  'absl/log/internal/log_message.h',
+  'absl/log/internal/log_sink_set.h',
+  'absl/log/internal/nullguard.h',
+  'absl/log/internal/nullstream.h',
+  'absl/log/internal/proto.h',
+  'absl/log/internal/strip.h',
+  'absl/log/internal/structured.h',
+  'absl/log/internal/test_actions.h',
+  'absl/log/internal/test_helpers.h',
+  'absl/log/internal/test_matchers.h',
+  'absl/log/internal/vlog_config.h',
+  'absl/log/internal/voidify.h',
+  'absl/log/log.h',
+  'absl/log/log_entry.h',
+  'absl/log/log_sink.h',
+  'absl/log/log_sink_registry.h',
+  'absl/log/log_streamer.h',
+  'absl/log/scoped_mock_log.h',
+  'absl/log/structured.h',
+)
+
+absl_numeric_sources = files(
+  'absl/numeric/int128.cc',
+)
+absl_numeric_headers = files(
+  'absl/numeric/bits.h',
+  'absl/numeric/int128.h',
+  'absl/numeric/int128_have_intrinsic.inc',
+  'absl/numeric/int128_no_intrinsic.inc',
+  'absl/numeric/internal/bits.h',
+  'absl/numeric/internal/representation.h',
+)
+
+absl_profiling_sources = files(
+  'absl/profiling/internal/exponential_biased.cc',
+  'absl/profiling/internal/periodic_sampler.cc',
+)
+absl_profiling_headers = files(
+  'absl/profiling/internal/exponential_biased.h',
+  'absl/profiling/internal/periodic_sampler.h',
+  'absl/profiling/internal/sample_recorder.h',
+)
+
+absl_random_sources = files(
+  'absl/random/discrete_distribution.cc',
+  'absl/random/gaussian_distribution.cc',
+  'absl/random/internal/chi_square.cc',
+  'absl/random/internal/pool_urbg.cc',
+  'absl/random/internal/randen.cc',
+  'absl/random/internal/randen_detect.cc',
+  'absl/random/internal/randen_hwaes.cc',
+  'absl/random/internal/randen_round_keys.cc',
+  'absl/random/internal/randen_slow.cc',
+  'absl/random/internal/seed_material.cc',
+  'absl/random/seed_gen_exception.cc',
+  'absl/random/seed_sequences.cc',
+)
+absl_random_headers = files(
+  'absl/random/bernoulli_distribution.h',
+  'absl/random/beta_distribution.h',
+  'absl/random/bit_gen_ref.h',
+  'absl/random/discrete_distribution.h',
+  'absl/random/distributions.h',
+  'absl/random/exponential_distribution.h',
+  'absl/random/gaussian_distribution.h',
+  'absl/random/internal/chi_square.h',
+  'absl/random/internal/distribution_caller.h',
+  'absl/random/internal/distribution_test_util.h',
+  'absl/random/internal/explicit_seed_seq.h',
+  'absl/random/internal/fast_uniform_bits.h',
+  'absl/random/internal/fastmath.h',
+  'absl/random/internal/generate_real.h',
+  'absl/random/internal/iostream_state_saver.h',
+  'absl/random/internal/mock_helpers.h',
+  'absl/random/internal/mock_overload_set.h',
+  'absl/random/internal/nanobenchmark.h',
+  'absl/random/internal/nonsecure_base.h',
+  'absl/random/internal/pcg_engine.h',
+  'absl/random/internal/platform.h',
+  'absl/random/internal/pool_urbg.h',
+  'absl/random/internal/randen.h',
+  'absl/random/internal/randen_detect.h',
+  'absl/random/internal/randen_engine.h',
+  'absl/random/internal/randen_hwaes.h',
+  'absl/random/internal/randen_slow.h',
+  'absl/random/internal/randen_traits.h',
+  'absl/random/internal/salted_seed_seq.h',
+  'absl/random/internal/seed_material.h',
+  'absl/random/internal/sequence_urbg.h',
+  'absl/random/internal/traits.h',
+  'absl/random/internal/uniform_helper.h',
+  'absl/random/internal/wide_multiply.h',
+  'absl/random/log_uniform_int_distribution.h',
+  'absl/random/mock_distributions.h',
+  'absl/random/mocking_bit_gen.h',
+  'absl/random/poisson_distribution.h',
+  'absl/random/random.h',
+  'absl/random/seed_gen_exception.h',
+  'absl/random/seed_sequences.h',
+  'absl/random/uniform_int_distribution.h',
+  'absl/random/uniform_real_distribution.h',
+  'absl/random/zipf_distribution.h',
+)
+
+absl_status_sources = files(
+  'absl/status/internal/status_internal.cc',
+  'absl/status/status.cc',
+  'absl/status/status_payload_printer.cc',
+  'absl/status/statusor.cc',
+)
+absl_status_headers = files(
+  'absl/status/internal/status_internal.h',
+  'absl/status/internal/statusor_internal.h',
+  'absl/status/status.h',
+  'absl/status/status_payload_printer.h',
+  'absl/status/statusor.h',
+)
+
+absl_strings_sources = files(
+  'absl/strings/ascii.cc',
+  'absl/strings/charconv.cc',
+  'absl/strings/cord.cc',
+  'absl/strings/cord_analysis.cc',
+  'absl/strings/cord_buffer.cc',
+  'absl/strings/escaping.cc',
+  'absl/strings/internal/charconv_bigint.cc',
+  'absl/strings/internal/charconv_parse.cc',
+  'absl/strings/internal/cord_internal.cc',
+  'absl/strings/internal/cord_rep_btree.cc',
+  'absl/strings/internal/cord_rep_btree_navigator.cc',
+  'absl/strings/internal/cord_rep_btree_reader.cc',
+  'absl/strings/internal/cord_rep_consume.cc',
+  'absl/strings/internal/cord_rep_crc.cc',
+  'absl/strings/internal/cordz_functions.cc',
+  'absl/strings/internal/cordz_handle.cc',
+  'absl/strings/internal/cordz_info.cc',
+  'absl/strings/internal/cordz_sample_token.cc',
+  'absl/strings/internal/damerau_levenshtein_distance.cc',
+  'absl/strings/internal/escaping.cc',
+  'absl/strings/internal/memutil.cc',
+  'absl/strings/internal/ostringstream.cc',
+  'absl/strings/internal/pow10_helper.cc',
+  'absl/strings/internal/str_format/arg.cc',
+  'absl/strings/internal/str_format/bind.cc',
+  'absl/strings/internal/str_format/extension.cc',
+  'absl/strings/internal/str_format/float_conversion.cc',
+  'absl/strings/internal/str_format/output.cc',
+  'absl/strings/internal/str_format/parser.cc',
+  'absl/strings/internal/stringify_sink.cc',
+  'absl/strings/internal/utf8.cc',
+  'absl/strings/match.cc',
+  'absl/strings/numbers.cc',
+  'absl/strings/str_cat.cc',
+  'absl/strings/str_replace.cc',
+  'absl/strings/str_split.cc',
+  'absl/strings/string_view.cc',
+  'absl/strings/substitute.cc',
+)
+absl_strings_headers = files(
+  'absl/strings/ascii.h',
+  'absl/strings/charconv.h',
+  'absl/strings/cord.h',
+  'absl/strings/cord_analysis.h',
+  'absl/strings/cord_buffer.h',
+  'absl/strings/cord_test_helpers.h',
+  'absl/strings/cordz_test_helpers.h',
+  'absl/strings/escaping.h',
+  'absl/strings/has_absl_stringify.h',
+  'absl/strings/internal/charconv_bigint.h',
+  'absl/strings/internal/charconv_parse.h',
+  'absl/strings/internal/cord_data_edge.h',
+  'absl/strings/internal/cord_internal.h',
+  'absl/strings/internal/cord_rep_btree.h',
+  'absl/strings/internal/cord_rep_btree_navigator.h',
+  'absl/strings/internal/cord_rep_btree_reader.h',
+  'absl/strings/internal/cord_rep_consume.h',
+  'absl/strings/internal/cord_rep_crc.h',
+  'absl/strings/internal/cord_rep_flat.h',
+  'absl/strings/internal/cord_rep_test_util.h',
+  'absl/strings/internal/cordz_functions.h',
+  'absl/strings/internal/cordz_handle.h',
+  'absl/strings/internal/cordz_info.h',
+  'absl/strings/internal/cordz_sample_token.h',
+  'absl/strings/internal/cordz_statistics.h',
+  'absl/strings/internal/cordz_update_scope.h',
+  'absl/strings/internal/cordz_update_tracker.h',
+  'absl/strings/internal/damerau_levenshtein_distance.h',
+  'absl/strings/internal/escaping.h',
+  'absl/strings/internal/escaping_test_common.h',
+  'absl/strings/internal/memutil.h',
+  'absl/strings/internal/numbers_test_common.h',
+  'absl/strings/internal/ostringstream.h',
+  'absl/strings/internal/pow10_helper.h',
+  'absl/strings/internal/resize_uninitialized.h',
+  'absl/strings/internal/stl_type_traits.h',
+  'absl/strings/internal/str_format/arg.h',
+  'absl/strings/internal/str_format/bind.h',
+  'absl/strings/internal/str_format/checker.h',
+  'absl/strings/internal/str_format/constexpr_parser.h',
+  'absl/strings/internal/str_format/extension.h',
+  'absl/strings/internal/str_format/float_conversion.h',
+  'absl/strings/internal/str_format/output.h',
+  'absl/strings/internal/str_format/parser.h',
+  'absl/strings/internal/str_join_internal.h',
+  'absl/strings/internal/str_split_internal.h',
+  'absl/strings/internal/string_constant.h',
+  'absl/strings/internal/stringify_sink.h',
+  'absl/strings/internal/utf8.h',
+  'absl/strings/match.h',
+  'absl/strings/numbers.h',
+  'absl/strings/str_cat.h',
+  'absl/strings/str_format.h',
+  'absl/strings/str_join.h',
+  'absl/strings/str_replace.h',
+  'absl/strings/str_split.h',
+  'absl/strings/string_view.h',
+  'absl/strings/strip.h',
+  'absl/strings/substitute.h',
+)
+
+absl_synchronization_sources = files(
+  'absl/synchronization/barrier.cc',
+  'absl/synchronization/blocking_counter.cc',
+  'absl/synchronization/internal/create_thread_identity.cc',
+  'absl/synchronization/internal/futex_waiter.cc',
+  'absl/synchronization/internal/graphcycles.cc',
+  'absl/synchronization/internal/kernel_timeout.cc',
+  'absl/synchronization/internal/per_thread_sem.cc',
+  'absl/synchronization/internal/pthread_waiter.cc',
+  'absl/synchronization/internal/sem_waiter.cc',
+  'absl/synchronization/internal/stdcpp_waiter.cc',
+  'absl/synchronization/internal/waiter_base.cc',
+  'absl/synchronization/internal/win32_waiter.cc',
+  'absl/synchronization/mutex.cc',
+  'absl/synchronization/notification.cc',
+)
+absl_synchronization_headers = files(
+  'absl/synchronization/barrier.h',
+  'absl/synchronization/blocking_counter.h',
+  'absl/synchronization/internal/create_thread_identity.h',
+  'absl/synchronization/internal/futex.h',
+  'absl/synchronization/internal/graphcycles.h',
+  'absl/synchronization/internal/kernel_timeout.h',
+  'absl/synchronization/internal/per_thread_sem.h',
+  'absl/synchronization/internal/thread_pool.h',
+  'absl/synchronization/internal/waiter.h',
+  'absl/synchronization/mutex.h',
+  'absl/synchronization/notification.h',
+)
+
+absl_time_sources = files(
+  'absl/time/civil_time.cc',
+  'absl/time/clock.cc',
+  'absl/time/duration.cc',
+  'absl/time/format.cc',
+  'absl/time/internal/cctz/src/civil_time_detail.cc',
+  'absl/time/internal/cctz/src/time_zone_fixed.cc',
+  'absl/time/internal/cctz/src/time_zone_format.cc',
+  'absl/time/internal/cctz/src/time_zone_if.cc',
+  'absl/time/internal/cctz/src/time_zone_impl.cc',
+  'absl/time/internal/cctz/src/time_zone_info.cc',
+  'absl/time/internal/cctz/src/time_zone_libc.cc',
+  'absl/time/internal/cctz/src/time_zone_lookup.cc',
+  'absl/time/internal/cctz/src/time_zone_posix.cc',
+  'absl/time/internal/cctz/src/zone_info_source.cc',
+  'absl/time/time.cc',
+)
+absl_time_headers = files(
+  'absl/time/civil_time.h',
+  'absl/time/clock.h',
+  'absl/time/internal/cctz/include/cctz/civil_time.h',
+  'absl/time/internal/cctz/include/cctz/civil_time_detail.h',
+  'absl/time/internal/cctz/include/cctz/time_zone.h',
+  'absl/time/internal/cctz/include/cctz/zone_info_source.h',
+  'absl/time/internal/cctz/src/time_zone_fixed.h',
+  'absl/time/internal/cctz/src/time_zone_if.h',
+  'absl/time/internal/cctz/src/time_zone_impl.h',
+  'absl/time/internal/cctz/src/time_zone_info.h',
+  'absl/time/internal/cctz/src/time_zone_libc.h',
+  'absl/time/internal/cctz/src/time_zone_posix.h',
+  'absl/time/internal/cctz/src/tzfile.h',
+  'absl/time/internal/get_current_time_chrono.inc',
+  'absl/time/internal/get_current_time_posix.inc',
+  'absl/time/internal/test_util.h',
+  'absl/time/time.h',
+)
+
+absl_types_sources = files(
+  'absl/types/bad_any_cast.cc',
+  'absl/types/bad_optional_access.cc',
+  'absl/types/bad_variant_access.cc',
+)
+absl_types_headers = files(
+  'absl/types/any.h',
+  'absl/types/bad_any_cast.h',
+  'absl/types/bad_optional_access.h',
+  'absl/types/bad_variant_access.h',
+  'absl/types/compare.h',
+  'absl/types/internal/optional.h',
+  'absl/types/internal/span.h',
+  'absl/types/internal/variant.h',
+  'absl/types/optional.h',
+  'absl/types/span.h',
+  'absl/types/variant.h',
+)
+
+# Libraries
+absl_base_lib = static_library(
+  'absl_base',
+  absl_base_sources,
+  include_directories: absl_include_dir,
+  cpp_args: arch_flags,
+  dependencies: [dependency('threads'), libatomic],
+)
+
+absl_hash_lib = static_library(
+  'absl_hash',
+  absl_hash_sources,
+  include_directories: absl_include_dir,
+)
+
+absl_numeric_lib = static_library(
+  'absl_numeric',
+  absl_numeric_sources,
+  include_directories: absl_include_dir,
+)
+
+absl_profiling_lib = static_library(
+  'absl_profiling',
+  absl_profiling_sources,
+  include_directories: absl_include_dir,
+)
+
+absl_crc_lib = static_library(
+  'absl_crc',
+  absl_crc_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+  ],
+  dependencies: libatomic,
+)
+
+absl_strings_lib = static_library(
+  'absl_strings',
+  absl_strings_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_crc_lib,
+    absl_numeric_lib,
+    absl_profiling_lib,
+  ],
+)
+
+absl_debugging_lib = static_library(
+  'absl_debugging',
+  absl_debugging_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_strings_lib,
+  ],
+  dependencies: libatomic,
+)
+
+absl_random_lib = static_library(
+  'absl_random',
+  absl_random_sources,
+  include_directories: absl_include_dir,
+  cpp_args: hw_flags,
+  link_with: [
+    absl_base_lib,
+    absl_strings_lib,
+  ],
+  dependencies: libatomic,
+)
+
+absl_time_lib = static_library(
+  'absl_time',
+  absl_time_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_numeric_lib,
+    absl_strings_lib,
+  ],
+  # macOS only, upstream: https://github.com/abseil/abseil-cpp/pull/280
+  dependencies: dependency('appleframeworks', modules: 'CoreFoundation', required: host_machine.system() == 'darwin'),
+)
+
+absl_types_lib = static_library(
+  'absl_types',
+  absl_types_sources,
+  include_directories: absl_include_dir,
+)
+
+absl_synchronization_lib = static_library(
+  'absl_synchronization',
+  absl_synchronization_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_debugging_lib,
+    absl_time_lib,
+  ],
+)
+
+absl_container_lib = static_library(
+  'absl_container',
+  absl_container_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_debugging_lib,
+    absl_hash_lib,
+    absl_synchronization_lib,
+    absl_time_lib,
+  ],
+)
+
+absl_flags_lib = static_library(
+  'absl_flags',
+  absl_flags_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_container_lib,
+    absl_hash_lib,
+    absl_strings_lib,
+    absl_synchronization_lib,
+  ],
+  dependencies: libatomic,
+)
+
+absl_status_lib = static_library(
+  'absl_status',
+  absl_status_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_strings_lib,
+  ],
+)
+
+absl_log_lib = static_library(
+  'absl_log',
+  absl_log_sources,
+  include_directories: absl_include_dir,
+  link_with: [
+    absl_base_lib,
+    absl_strings_lib,
+    absl_flags_lib,
+  ],
+  dependencies: libatomic,
+)
+
+# Dependencies
+absl_base_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_base_lib,
+)
+
+absl_hash_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_hash_lib,
+)
+
+absl_numeric_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_numeric_lib,
+)
+
+absl_profiling_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_profiling_lib,
+)
+
+absl_strings_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_strings_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_numeric_dep,
+  ],
+)
+
+absl_debugging_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_debugging_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_strings_dep,
+  ],
+)
+
+absl_random_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_random_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_strings_dep,
+  ],
+)
+
+absl_crc_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_crc_lib,
+  dependencies: [
+    absl_base_dep,
+  ],
+)
+
+absl_time_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_time_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_numeric_dep,
+    absl_strings_dep,
+  ],
+)
+
+absl_types_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_types_lib,
+)
+
+absl_synchronization_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_synchronization_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_debugging_dep,
+    absl_time_dep,
+  ],
+)
+
+absl_container_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_container_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_debugging_dep,
+    absl_hash_dep,
+    absl_synchronization_dep,
+    absl_time_dep,
+  ],
+)
+
+absl_flags_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_flags_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_container_dep,
+    absl_hash_dep,
+    absl_strings_dep,
+    absl_synchronization_dep,
+  ],
+)
+
+absl_log_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_log_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_strings_dep,
+    absl_flags_dep,
+  ],
+)
+
+absl_status_dep = declare_dependency(
+  include_directories: absl_include_dir,
+  link_with: absl_status_lib,
+  dependencies: [
+    absl_base_dep,
+    absl_strings_dep,
+  ],
+)
diff --git a/subprojects/packagefiles/cutlass/meson.build b/subprojects/packagefiles/cutlass/meson.build
new file mode 100644
index 0000000000..efeb2bddf2
--- /dev/null
+++ b/subprojects/packagefiles/cutlass/meson.build
@@ -0,0 +1,4 @@
+project('cutlass', 'cpp')
+
+# Path to this subproject's 'include' directory (presumably the CUTLASS headers; verify against the consumer).
+include_directory = meson.current_source_dir() + '/include'
diff --git a/subprojects/packagefiles/gaviotatb/meson.build b/subprojects/packagefiles/gaviotatb/meson.build
index d235a257d7..2a1b51bfae 100644
--- a/subprojects/packagefiles/gaviotatb/meson.build
+++ b/subprojects/packagefiles/gaviotatb/meson.build
@@ -24,11 +24,22 @@ gaviotatb_includes = [
   'compression',
   'compression/huffman',
   'compression/liblzf',
-  'compression/lzma'
+  'compression/lzma',
+  'compression/zlib'
 ]
 
+gaviota_lib = static_library('gaviota',
+  gaviotatb_src,
+  c_args : meson.get_compiler('c').get_supported_arguments([
+    '-Dz_uLong=uLong',
+    '-Wno-misleading-indentation',
+    '-Wno-self-assign',
+    '-Wno-language-extension-token',
+    '-Wno-expansion-to-defined']),
+  include_directories : gaviotatb_includes)
+
+incdir = include_directories('.')
+
 gaviotatb_dep = declare_dependency(
-  sources: gaviotatb_src,
-  include_directories: gaviotatb_includes,
-  compile_args:'-Dz_uLong=uLong'
-)
+  link_with : gaviota_lib,
+  include_directories : gaviotatb_includes)
diff --git a/subprojects/perfetto.wrap b/subprojects/perfetto.wrap
new file mode 100644
index 0000000000..cd29c93dde
--- /dev/null
+++ b/subprojects/perfetto.wrap
@@ -0,0 +1,6 @@
+[wrap-git]
+url = https://github.com/google/perfetto.git
+revision = v50.1
+# Shallow clone: fetch only one commit of history for the pinned revision.
+depth = 1
+
diff --git a/third_party/fused_multi_head_attention/CMakeLists.txt b/third_party/fused_multi_head_attention/CMakeLists.txt
new file mode 100644
index 0000000000..4087c3a808
--- /dev/null
+++ b/third_party/fused_multi_head_attention/CMakeLists.txt
@@ -0,0 +1,56 @@
+
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cutlass_example_add_executable(  # helper is not defined in this file; presumably supplied by the CUTLASS examples build — confirm
+  41_fused_multi_head_attention_fixed_seqlen
+  fused_multihead_attention_fixed_seqlen.cu
+  )
+
+cutlass_example_add_executable(
+  41_fused_multi_head_attention_variable_seqlen
+  fused_multihead_attention_variable_seqlen.cu
+  )
+
+cutlass_example_add_executable(
+  41_fused_multi_head_attention_backward
+  fused_multi_head_attention_backward.cu
+  DISABLE_TESTS ON
+  )
+
+# Umbrella target: depends on the three example executables declared above.
+add_custom_target(41_fused_multi_head_attention
+DEPENDS 41_fused_multi_head_attention_fixed_seqlen
+        41_fused_multi_head_attention_variable_seqlen
+        41_fused_multi_head_attention_backward
+)
+# Test: runs fmha_backward_test.py with ${Python3_EXECUTABLE}, passing the path of the backward executable.
+add_test(
+  NAME ctest_examples_41_fmha_backward_python
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/fmha_backward_test.py $<TARGET_FILE:41_fused_multi_head_attention_backward>
+)
diff --git a/third_party/fused_multi_head_attention/debug_utils.h b/third_party/fused_multi_head_attention/debug_utils.h
new file mode 100644
index 0000000000..a22f12b711
--- /dev/null
+++ b/third_party/fused_multi_head_attention/debug_utils.h
@@ -0,0 +1,234 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+#include <cfloat>
+#include <cstdio>
+#include <cmath>
+
+////////////////////////////////////////////////////////////////////////////////
+// Debugging functions
+////////////////////////////////////////////////////////////////////////////////
+// NaN/inf detection: asserts that every element of `frag` (needs size() and operator[]) is a finite float. `frag` is parenthesized so arguments such as `*ptr` expand correctly.
+#define NANCHECK(frag)                              \
+  {                                                 \
+    for (size_t _i = 0; _i < (frag).size(); ++_i) { \
+      assert(std::isfinite(float((frag)[_i])));     \
+      assert(!std::isnan(float((frag)[_i])));       \
+    }                                               \
+  }
+
+// Device-side printf helpers. B0_T0: one chosen thread of block (0,0,0); T0: that thread in every block; TX_LX: every thread of every block, prefixed with its block/thread indices.
+#if 1  // 1: the printing macros below; 0: the empty definitions in the #else branch
+#define PRINT_WARP_ID 0  // compared against threadIdx.y
+#define PRINT_LANE_ID 0  // compared against threadIdx.x
+#define PRINT_B0_T0(msg, ...)                                         \
+  if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&        \
+      threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \
+      threadIdx.z == 0) {                                             \
+    printf(msg "\n", ##__VA_ARGS__);                                  \
+  }
+#define PRINT_T0(msg, ...)                                            \
+  if (threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \
+      threadIdx.z == 0) {                                             \
+    printf(msg "\n", ##__VA_ARGS__);                                  \
+  }
+#define PRINT_TX_LX(msg, ...)                                                 \
+  for (int bx = 0; bx < gridDim.x; ++bx) {                                    \
+    for (int by = 0; by < gridDim.y; ++by) {                                  \
+      for (int bz = 0; bz < gridDim.z; ++bz) {                                \
+        for (int tx = 0; tx < blockDim.x; ++tx) {                             \
+          for (int ty = 0; ty < blockDim.y; ++ty) {                           \
+            for (int tz = 0; tz < blockDim.z; ++tz) {                         \
+              __syncthreads();                                                \
+              if (blockIdx.x == bx && blockIdx.y == by && blockIdx.z == bz && \
+                  threadIdx.x == tx && threadIdx.y == ty &&                   \
+                  threadIdx.z == tz) {                                        \
+                printf(                                                       \
+                    "[%d,%d,%d][%d,%d,%d]" msg "\n",                          \
+                    bx,                                                       \
+                    by,                                                       \
+                    bz,                                                       \
+                    tx,                                                       \
+                    ty,                                                       \
+                    tz,                                                       \
+                    ##__VA_ARGS__);                                           \
+              }                                                               \
+            }                                                                 \
+          }                                                                   \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+  }
+#else  // NOTE(review): this branch defines no PRINT_T0 — confirm nothing uses PRINT_T0 when printing is switched off
+#define PRINT_B0_T0  // object-like and empty: a call's "(args)" is left behind as a plain parenthesized expression
+#define PRINT_TX_LX  // same as above
+#endif  // printing enabled / disabled
+
+struct __string_view {  // minimal (pointer, length) pair; the return type of __get_type_name()
+  char const* data;  // first character; nothing in this struct guarantees a NUL at data[size]
+  std::size_t size;  // number of characters
+};
+#if __cplusplus >= 201402L  // C++14 or later: the constexpr body below contains loops
+template <class T>
+constexpr __string_view __get_type_name() {  // returns the slice of __PRETTY_FUNCTION__ that spells T
+  char const* p = __PRETTY_FUNCTION__;
+  while (*p++ != '=')  // advance to just past the first '='
+    ;
+  for (; *p == ' '; ++p)  // then skip the spaces that follow it
+    ;
+  char const* p2 = p;
+  int count = 1;  // bracket depth, starting at 1 — assumes the text after '=' sits inside one enclosing "[...]" (compiler-specific; TODO confirm)
+  for (;; ++p2) {
+    switch (*p2) {
+      case '[':
+        ++count;
+        break;
+      case ']':
+        --count;
+        if (!count)  // the enclosing ']' was reached: return the range [p, p2)
+          return {p, std::size_t(p2 - p)};
+    }
+  }
+  return {};  // never reached: the loop above exits only through its return
+}
+#else  // older standards: fixed placeholder instead of the real type name
+template <class T>
+constexpr __string_view __get_type_name() {
+  return {"unsupported", 11};  // 11 is the length of "unsupported"
+}
+#endif
+
+// Print the 8 elements accum[start] .. accum[start + 7] as floats through PRINT_B0_T0. The expansion already ends in ';'.
+#define PRINT_ACCUM8_T0_L0_START(name, accum, start)  \
+  PRINT_B0_T0(                                        \
+      "%s[%d:%d] - {%f, %f, %f, %f, %f, %f, %f, %f}", \
+      name,                                           \
+      int(start),                                     \
+      int(start + 8),                                 \
+      float(accum[start + 0]),                        \
+      float(accum[start + 1]),                        \
+      float(accum[start + 2]),                        \
+      float(accum[start + 3]),                        \
+      float(accum[start + 4]),                        \
+      float(accum[start + 5]),                        \
+      float(accum[start + 6]),                        \
+      float(accum[start + 7]));
+#define PRINT_ACCUM8_T0_L0(name, accum) PRINT_ACCUM8_T0_L0_START(name, accum, 0)  // elements 0..7
+#define PRINT_FRAG_T0_L0(name, frag)                          \
+  { /* Prints the type name of frag, then its elements in rows of 8 (row starts 0, 8, ... below frag.size()). */ \
+    auto typeStr = __get_type_name<decltype(frag)>();         \
+    PRINT_B0_T0("printing %s (%s)", name, typeStr.data); /* NOTE(review): %s ignores typeStr.size and prints up to the next NUL — confirm intended */ \
+    for (size_t _start = 0; _start < frag.size(); _start += 8) { \
+      PRINT_ACCUM8_T0_L0_START("  ", frag, _start);           \
+    }                                                         \
+    /*__syncthreads();                                        \
+    NANCHECK(frag); */                                        \
+  }
+#define PRINT_ARRAY_T0_L0_INCR(name, array, length, incr)       \
+  { /* Prints `array` in rows of 8 floats; row starts advance by `incr` while they are below `length`. */ \
+    PRINT_B0_T0("printing %s (len=%d)", name, int(length));     \
+    for (int _start = 0; _start < (length); _start += (incr)) { /* parenthesized: arguments may be expressions */ \
+      PRINT_ACCUM8_T0_L0_START("  ", array, _start); /* always reads 8 elements starting at _start */ \
+    }                                                           \
+  }
+#define PRINT_ARRAY_T0_L0(name, array, length) \
+  PRINT_ARRAY_T0_L0_INCR(name, array, length, 8)  // incr of 8 matches the 8 values printed per row
+
+// Print the 4x4 tile ref.at({start_x + i, start_y + j}) for i, j in 0..3, one row of the tile per output line, through PRINT_B0_T0. The expansion already ends in ';'.
+#define PRINT_TENSOR4x4_T0_L0_START(name, ref, start_x, start_y)                                           \
+  PRINT_B0_T0(                                                                                             \
+      "%s[%d:%d, %d:%d]:\n    %f, %f, %f, %f\n    %f, %f, %f, %f\n    %f, %f, %f, %f\n    %f, %f, %f, %f", \
+      name,                                                                                                \
+      int(start_x),                                                                                        \
+      int(start_x + 4),                                                                                    \
+      int(start_y),                                                                                        \
+      int(start_y + 4),                                                                                    \
+      float(ref.at({start_x + 0, start_y + 0})),                                                           \
+      float(ref.at({start_x + 0, start_y + 1})),                                                           \
+      float(ref.at({start_x + 0, start_y + 2})),                                                           \
+      float(ref.at({start_x + 0, start_y + 3})),                                                           \
+      float(ref.at({start_x + 1, start_y + 0})),                                                           \
+      float(ref.at({start_x + 1, start_y + 1})),                                                           \
+      float(ref.at({start_x + 1, start_y + 2})),                                                           \
+      float(ref.at({start_x + 1, start_y + 3})),                                                           \
+      float(ref.at({start_x + 2, start_y + 0})),                                                           \
+      float(ref.at({start_x + 2, start_y + 1})),                                                           \
+      float(ref.at({start_x + 2, start_y + 2})),                                                           \
+      float(ref.at({start_x + 2, start_y + 3})),                                                           \
+      float(ref.at({start_x + 3, start_y + 0})),                                                           \
+      float(ref.at({start_x + 3, start_y + 1})),                                                           \
+      float(ref.at({start_x + 3, start_y + 2})),                                                           \
+      float(ref.at({start_x + 3, start_y + 3})));
+#define PRINT_TENSOR4x4_T0_L0(name, ref) \
+  PRINT_TENSOR4x4_T0_L0_START(name, ref, 0, 0)  // the tile whose first element is at (0, 0)
+
+#define PRINT_PROBLEM_SIZE(name, ps)            \
+  PRINT_B0_T0(                                  \
+      "%s.problem_size: {.m=%d, .n=%d, .k=%d}", \
+      name,                                     \
+      int(ps.m()),                              \
+      int(ps.n()),                              \
+      int(ps.k()))  // prints ps.m(), ps.n(), ps.k() as ints; the expansion has no trailing ';', so the call site supplies it
+
+template <typename LambdaIterator, typename LaneOffsetT, typename AccumT>
+CUTLASS_DEVICE void print_warp_accum(  // debug dump: walks a num_rows x num_cols grid and prints the matching accum entries
+    AccumT accum,  // indexed as accum[idx] with the idx supplied by LambdaIterator::iterateRows
+    LaneOffsetT lane_offset,  // forwarded unchanged to LambdaIterator::iterateRows
+    int32_t num_rows,
+    int32_t num_cols) {
+  bool is_main = blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
+      threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;  // thread (0,0,0) of block (0,0,0)
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      if (col % 32 == 0) {  // every 32 columns is_main starts a new labelled output line
+        if (is_main) {
+          printf("\nmat[%3d, %3d:%3d]", row, col, col + 32);
+        }
+        __syncthreads();  // executed by every thread, not only is_main
+      }
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) {},  // first callback: deliberately empty
+          [&](int accum_m, int accum_n, int idx) {
+            if (row == accum_m && col == accum_n &&
+                (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)) {  // any thread of block (0,0,0) whose callback reports (row, col) prints
+              printf(" %6.1f", float(accum[idx]));
+            }
+          },
+          [&](int accum_m) {});  // third callback: deliberately empty
+      __syncthreads();  // barrier after each (row, col) step — presumably to keep the prints in element order; TODO confirm
+    }
+    if (is_main) {
+      printf("\n");
+    }
+  }
+}
diff --git a/third_party/fused_multi_head_attention/default_fmha_grouped.h b/third_party/fused_multi_head_attention/default_fmha_grouped.h
new file mode 100644
index 0000000000..14604f10c3
--- /dev/null
+++ b/third_party/fused_multi_head_attention/default_fmha_grouped.h
@@ -0,0 +1,299 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief
+      Default kernel-level definitions for grouped fused multi-head attention: DefaultFMHAGrouped
+      assembles the two threadblock-scoped matmuls (Q @ K.T, then attn @ V) into an FMHAGrouped kernel.
+
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "fmha_grouped.h"
+#include "gemm_kernel_utils.h"
+#include "gemm/custom_mma.h"
+#include "gemm/find_default_mma.h"
+#include "gemm/mma_from_smem.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    // The datatype of Q/K/V
+    typename scalar_t_,
+    // Architecture we are targeting (eg `cutlass::arch::Sm80`)
+    typename ArchTag_,
+    // If Q/K/V are correctly aligned in memory and we can run a fast kernel
+    bool isAligned_,
+    int kQueriesPerBlock,  // first (M) extent of MM0::ThreadblockShape
+    int kKeysPerBlock,  // second (N) extent of MM0::ThreadblockShape
+    int kMaxK = (int)cutlass::platform::numeric_limits<uint32_t>::max(),  // NOTE(review): uint32_t max cast to int is -1 on the usual targets, which makes kSingleValueIteration true by default — confirm callers always pass kMaxK
+    GroupScheduleMode GroupScheduleMode_ = GroupScheduleMode::kDeviceOnly
+    >
+struct DefaultFMHAGrouped {  // trait bundle: derives MM0 (Q @ K.T), MM1 (attn @ V) and the FMHAKernel type from the template parameters
+  using scalar_t = scalar_t_;
+  using accum_t = float;
+  using output_t = scalar_t;
+
+  // Accumulator between 2 iterations
+  // Using `accum_t` improves perf on f16 at the cost of
+  // numerical errors
+  using output_accum_t = accum_t;
+
+  using ArchTag = ArchTag_;
+  static bool const kIsAligned = isAligned_;
+  static bool const kSingleValueIteration = kMaxK <= kKeysPerBlock;  // selects the MakeCustomMma variant in MM0 and is forwarded to FMHAGrouped
+  static constexpr bool kIsHalf = cutlass::sizeof_bits<scalar_t>::value == 16;  // true for any 16-bit scalar_t
+  static int const kWarpSize = 32;
+  static int const kNumWarpsPerBlock = kQueriesPerBlock * kKeysPerBlock / (kWarpSize * kWarpSize);  // one warp per 32x32 piece of the block; checked against the Mma warp counts below
+
+  struct MM0 {
+    /*
+      In this first matmul, we compute a block of `Q @ K.T`.
+      While the calculation result is still hot in registers, we update
+      `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value
+      into a shared-memory ("AccumulatorSharedStorage") that is used later as
+      operand A for the second matmul (see MM1)
+    */
+
+    using GemmType = gemm_kernel_utils::DefaultGemmType<ArchTag, scalar_t>;
+    using OpClass = typename GemmType::OpClass;
+
+    using ElementA = scalar_t;
+    using ElementB = scalar_t;
+    using ElementC = scalar_t;
+    using ElementAccumulator = accum_t;
+
+    using LayoutA = cutlass::layout::RowMajor;
+    using LayoutB = cutlass::layout::ColumnMajor;
+    using LayoutC = cutlass::layout::RowMajor;
+
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            ElementA,
+            ElementB,
+            ElementC,
+            ElementAccumulator
+            >;
+
+    static int const kAlignmentA =
+        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment;  // unaligned inputs fall back to the minimum alignment
+    static int const kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+
+    using ThreadblockShape = cutlass::gemm::GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    static int const kStages = DefaultConfig::kStages;  // note: FindDefaultMma below is given its own stage expression, not this constant
+    using Operator = typename GemmType::Operator;
+
+    using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma<
+        ElementA,
+        LayoutA,
+        kAlignmentA,
+        ElementB,
+        LayoutB,
+        kAlignmentB,
+        ElementAccumulator,
+        LayoutC,
+        OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        InstructionShape,
+        ArchTag::kMinComputeCapability >= 80 && kIsHalf  // 4 for 16-bit types on compute capability >= 80, else the configuration default
+            ? 4
+            : DefaultConfig::kStages,
+        Operator
+        >::DefaultMma;
+
+    using MmaCore = typename DefaultMma::MmaCore;
+    using IteratorA = typename DefaultMma::IteratorA;
+    using IteratorB = typename DefaultMma::IteratorB;
+    using DefaultThreadblockMma = typename DefaultMma::ThreadblockMma;
+    using Mma = typename cutlass::platform::conditional<  // MakeCustomMma<..., kMaxK> when kSingleValueIteration, otherwise the default threadblock Mma
+        kSingleValueIteration,
+        typename MakeCustomMma<DefaultThreadblockMma, kMaxK>::Mma,
+        DefaultThreadblockMma>::type;
+    using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator<
+        typename Mma::Operator::IteratorC,
+        ElementAccumulator,
+        kWarpSize>::Iterator;
+
+    static_assert(MmaCore::WarpCount::kCount == kNumWarpsPerBlock, "");  // the Mma's warp count must agree with kNumWarpsPerBlock
+
+    // Epilogue to store to shared-memory in a format that we can use later for
+    // the second matmul
+    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
+        typename Mma::Operator::IteratorC,
+        typename Mma::Operator,
+        scalar_t,
+        WarpShape,
+        ThreadblockShape>;
+    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage;
+  };
+
+  struct MM1 {
+    /*
+      Second matmul: perform `attn @ V` where `attn` is the attention (not
+      normalized) and stored in shared memory
+    */
+
+    using GemmType = typename MM0::GemmType;
+    using OpClass = typename GemmType::OpClass;
+
+    using ElementA = scalar_t;
+    using ElementB = scalar_t;
+    using ElementC = output_accum_t;
+    using ElementAccumulator = accum_t;
+
+    using LayoutA = cutlass::layout::RowMajor;
+    using LayoutB = cutlass::layout::RowMajor;
+    using LayoutC = cutlass::layout::RowMajor;
+
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            ElementA,
+            ElementB,
+            ElementC,
+            ElementAccumulator
+            >;
+
+    static int const kAlignmentA = DefaultConfig::kAlignmentA;  // unlike MM0, not conditioned on kIsAligned
+    static int const kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+
+    using ThreadblockShape = typename MM0::ThreadblockShape;
+    using WarpShape = typename MM0::WarpShape;
+    using InstructionShape = typename MM0::InstructionShape;
+
+    using EpilogueOutputOp = typename DefaultConfig::EpilogueOutputOp;
+
+    static int const kStages = DefaultConfig::kStages;  // note: DefaultGemm below is given its own stage expression, not this constant
+    using Operator = typename GemmType::Operator;
+
+    using ThreadblockSwizzle = void; // Swizzling is unused
+    static bool const kSplitKSerial = false;
+
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        ElementA,
+        LayoutA,
+        kAlignmentA,
+        ElementB,
+        LayoutB,
+        kAlignmentB,
+        ElementC,
+        LayoutC,
+        ElementAccumulator,
+        OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        InstructionShape,
+        EpilogueOutputOp,
+        ThreadblockSwizzle,
+        ArchTag::kMinComputeCapability >= 80 && kIsHalf  // same stage selection as in MM0
+            ? 4
+            : DefaultConfig::kStages,
+        kSplitKSerial,
+        Operator>;
+
+    using WarpIteratorA = typename cutlass::gemm::threadblock::
+    DefaultWarpIteratorAFromSharedMemory<
+        typename DefaultGemm::Mma::Policy::Operator::Shape, // WarpShape
+        typename DefaultGemm::Mma::Policy::Operator::InstructionShape,
+        typename DefaultGemm::Mma::Policy::Operator::IteratorA,
+        typename DefaultGemm::Mma::Policy>::WarpIterator;
+
+    using DefaultMmaFromSmem =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            MM0::AccumulatorSharedStorage::Shape::kN,  // kMaxK
+            WarpIteratorA,
+            false>; // kScaleOperandA
+
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+    static_assert(WarpCount::kCount == kNumWarpsPerBlock, "");  // same warp-count check as in MM0
+
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+    using OutputTileIterator =  // default epilogue's thread map, element type output_t
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_t>;
+    using OutputTileIteratorAccum =  // same thread map, element type output_accum_t
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_accum_t>;
+  };
+
+/// The grouped FMHA kernel type, instantiated from MM0, MM1 and the element types above
+  using FMHAKernel = kernel::FMHAGrouped<
+    MM0,
+    MM1,
+    scalar_t,
+    accum_t,
+    output_t,
+    output_accum_t,
+    kSingleValueIteration,
+    GroupScheduleMode_
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/epilogue/epilogue_pipelined.h b/third_party/fused_multi_head_attention/epilogue/epilogue_pipelined.h
new file mode 100644
index 0000000000..9ed17f4b4e
--- /dev/null
+++ b/third_party/fused_multi_head_attention/epilogue/epilogue_pipelined.h
@@ -0,0 +1,624 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  File copied from "cutlass/epilogue/threadblock/epilogue.h"
+  then modified to:
+  (1) load 2 source fragments at the same time (pipelining)
+  (2) support reading from a different dtype
+  (3) pass the row id to the OutputOp if it takes it
+    (see MemoryEfficientAttentionNormalize)
+  Note that in general the fragment passed to the OutputOp could
+  span multiple rows but it does not happen with the configurations we have
+*/
+
+#pragma once
+
+#include <cuda/std/cassert>
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+template <typename Op>
+struct ApplyEpilogueOp {
+  using FragmentOutput = typename Op::FragmentOutput;
+  using FragmentAccumulator = typename Op::FragmentAccumulator;
+  /// With source: calls `output_op(accum, source)`; `row_id` is not forwarded.
+  static CUTLASS_DEVICE FragmentOutput apply(
+      Op const& output_op, int row_id, FragmentAccumulator const& accum,
+      FragmentOutput const& source) {
+    return output_op(accum, source);
+  }
+  /// Without source: calls `output_op(accum)`; `row_id` is not forwarded.
+  static CUTLASS_DEVICE FragmentOutput apply(
+      Op const& output_op, int row_id, FragmentAccumulator const& accum) {
+    return output_op(accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+    typename Shape_, ///< Shape of threadblock tile (concept: GemmShape)
+    typename WarpMmaOperator_, ///< Warp-level MMA operator (concept:
+                               ///< gemm::warp::MmaTensorOp)
+    int PartitionsK, ///< Number of partitions of the K dimension
+    typename OutputTileIterator_, ///< Tile iterator writing output tensors
+    typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting
+                                           ///< accumulators
+    typename WarpTileIterator_, ///< Warp-scoped tile iterator writing
+                                ///< accumulators to SMEM
+    typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading
+                                  ///< from SMEM
+    typename OutputOp_, ///< Output operator
+    typename Padding_, ///< Padding added to SMEM allocation to avoid bank
+                       ///< conflicts (concept: MatrixShape)
+    int FragmentsPerPartition =
+        1, ///< Used to coarsen the epilogue granularity
+    int IterationsUnroll = ///< Used to reduce binary size when epilogue op is
+                           ///< large
+    (!IsEpilogueFunctorHeavy<OutputOp_>::value),
+    typename OutputTileSourceIterator_ =
+        OutputTileIterator_ ///< Tile iterator reading tensors
+    >
+class EpiloguePipelined : public EpilogueBase<
+                              Shape_,
+                              typename WarpMmaOperator_::Shape,
+                              PartitionsK,
+                              AccumulatorFragmentIterator_,
+                              WarpTileIterator_,
+                              Padding_,
+                              FragmentsPerPartition> {
+ public:
+  using Base = EpilogueBase<
+      Shape_,
+      typename WarpMmaOperator_::Shape,
+      PartitionsK,
+      AccumulatorFragmentIterator_,
+      WarpTileIterator_,
+      Padding_,
+      FragmentsPerPartition>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputTileIterator = OutputTileIterator_;
+  using OutputTileSourceIterator = OutputTileSourceIterator_;
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using OutputOp = OutputOp_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output element
+  using ElementOutput = typename OutputTileIterator::Element;
+  using ElementSource = typename OutputTileSourceIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  /// Tensor reference to destination tensor
+  using TensorRef = typename OutputTileIterator::TensorRef;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef =
+      typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Const tensor reference to source tensor
+  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
+
+  /// Array type used to output
+  using OutputAccessType = Array<
+      typename OutputTileIterator::Element,
+      OutputTileIterator::kElementsPerAccess>;
+  using SourceAccessType = Array<
+      typename OutputTileSourceIterator::Element,
+      OutputTileSourceIterator::kElementsPerAccess>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<
+      typename WarpTileIterator::Element,
+      OutputTileIterator::kElementsPerAccess>;
+
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1
+      ? Base::kFragmentsPerIteration
+      : kPartitionsK;
+  static int constexpr kSmemPointerOffset =
+      Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+ public:
+  static_assert(
+      OutputTileSourceIterator::Fragment::kElements ==
+          OutputTileIterator::Fragment::kElements,
+      "Mismatch between input tile and output tile iterator (kElements)");
+  static_assert(
+      OutputTileSourceIterator::kIterations == OutputTileIterator::kIterations,
+      "Mismatch between input tile and output tile iterator (kIterations)");
+  static_assert(
+      SharedLoadIterator::Fragment::kElements ==
+          OutputTileIterator::Fragment::kElements,
+      "Mismatch between shared load iterator and output tile iterator.");
+
+  static_assert(
+      OutputTileIterator::kElementsPerAccess,
+      "OutputTileIterator::kElementsPerAccess must not be zero.");
+
+  static_assert(
+      !(OutputTileIterator::Fragment::kElements %
+        OutputTileIterator::kElementsPerAccess),
+      "Divisibility");
+
+ private:
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+ public:
+  /// Constructor: initializes Base and the shared-memory load iterator
+  CUTLASS_DEVICE
+  EpiloguePipelined(
+      typename Base::SharedStorage& shared_storage, ///< Shared storage object
+      int thread_idx, ///< ID of a thread within the threadblock
+      int warp_idx, ///< ID of warp within threadblock
+      int lane_idx ///< ID of thread within warp
+      )
+      : Base(shared_storage, thread_idx, warp_idx, lane_idx),
+        shared_load_iterator_(shared_storage.reference(), thread_idx) {}
+
+  /// Streams the result to global memory; picks the path via is_source_needed()
+  CUTLASS_DEVICE
+  void operator()(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators, ///< Complete warp-level accumulator tile
+      OutputTileSourceIterator
+          source_iterator) { ///< Tile iterator for the source tensor; only
+                             ///< used when output_op.is_source_needed()
+
+    if (!output_op.is_source_needed()) {
+      compute_source_not_needed_(output_op, destination_iterator, accumulators);
+    } else {
+      compute_source_needed_(
+          output_op, destination_iterator, accumulators, source_iterator);
+    }
+  }
+  CUTLASS_DEVICE
+  void operator()(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators) { ///< Complete warp-level accumulator tile
+    compute_source_not_needed_(output_op, destination_iterator, accumulators);
+  }
+
+ private:
+  template <class Seq>
+  struct acc2smem_source_not_needed;
+  /// Copies one iteration's accumulator fragments into shared memory.
+  template <size_t... Seq>
+  struct acc2smem_source_not_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(
+        AccumulatorFragmentIterator accum_fragment_iterator,
+        WarpTileIterator& warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int skipped = 0; skipped < Advance; ++skipped) {
+        ++accum_fragment_iterator;
+      }
+      // The by-value iterator copy has now been advanced `Advance` times.
+      CUTLASS_PRAGMA_UNROLL
+      for (int frag = 0; frag < Base::kFragmentsPerIteration; ++frag) {
+        typename AccumulatorFragmentIterator::Fragment accum_fragment;
+        // Fetch the next fragment, then store it through the warp iterator.
+        accum_fragment_iterator.load(accum_fragment);
+        ++accum_fragment_iterator;
+        warp_tile_iterator.store(accum_fragment);
+        // Step the store pointer forward, except after the last fragment.
+        if (frag < Base::kFragmentsPerIteration - 1) {
+          warp_tile_iterator.add_pointer_offset(kSmemPointerOffset);
+        }
+      }
+      // Rewind the forward steps so the caller's iterator offset is restored.
+      if (Base::kFragmentsPerIteration > 1) {
+        warp_tile_iterator.add_pointer_offset(
+            kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+    /// Runs helper<Seq * kFragmentsPerIteration> for the entry matching `pos`.
+    CUTLASS_DEVICE
+    static void push(
+        size_t pos,
+        AccumulatorFragmentIterator const& iterator_begin,
+        WarpTileIterator& warp_tile_iterator) {
+      // Pack expansion: `&&` short-circuits, so only the match calls helper.
+      int dummy[] = {
+          (pos == (Seq * Base::kFragmentsPerIteration)) &&
+          (helper<Seq * Base::kFragmentsPerIteration>(
+               iterator_begin, warp_tile_iterator),
+           0)...};
+      CUTLASS_UNUSED(dummy[0]);
+    }
+  };
+
+  static_assert(
+      kPartitionsK == 1 || Base::kFragmentsPerIteration == 1,
+      "One of these must be exactly 1.");
+
+  /// Streams the result to global memory (path that reads no source tile)
+  CUTLASS_DEVICE
+  void compute_source_not_needed_(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators ///< Complete warp-level accumulator tile
+  ) {
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    // Iterate over accumulator tile, kFragmentsPerIteration fragments per
+    // step. The pragma requests a full unroll when IterationsUnroll is
+    // non-zero and an unroll factor of 1 otherwise.
+
+#pragma unroll(                                                          \
+    IterationsUnroll                                                     \
+        ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration \
+        : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations;
+         iter += Base::kFragmentsPerIteration) {
+      // Convert and store fragment: push() writes this step's accumulator
+      // fragments through warp_tile_iterator_. The barriers before and after
+      // it separate those writes from the shared-memory loads.
+
+      __syncthreads();
+
+      acc2smem_source_not_needed<cutlass::make_index_sequence<
+          OutputTileIterator::kIterations / Base::kFragmentsPerIteration>>::
+          push(iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      // Load fragments from shared memory. When kPartitionsK > 1 the
+      // per-partition fragments are summed into aligned_accum_fragment[0]
+      // and the load iterator is moved back by the same total offset.
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int p = 0; p < Base::kFragmentsPerIteration; ++p) {
+        typename SharedLoadIterator::Fragment
+            aligned_accum_fragment[kPartitionsK];
+
+        shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+        if (p < Base::kFragmentsPerIteration - 1) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+        } else if (kPartitionsK > 1) {
+          plus<typename SharedLoadIterator::Fragment> add_fragments;
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 1; i < kPartitionsK; ++i) {
+            shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+            shared_load_iterator_.load(aligned_accum_fragment[i]);
+            aligned_accum_fragment[0] = add_fragments(
+                aligned_accum_fragment[0], aligned_accum_fragment[i]);
+          }
+
+          shared_load_iterator_.add_pointer_offset(
+              (1 - kPartitionsK) * kSmemPointerOffset);
+        }
+
+        // Compute the output result. The destination iterator's
+        // thread_start_row() is passed as the base row from which the
+        // helper derives the row index handed to the output operator.
+
+        typename OutputTileIterator::Fragment output_fragment;
+
+        apply_output_operator_source_not_needed_(
+            destination_iterator.thread_start_row(),
+            output_fragment,
+            output_op,
+            aligned_accum_fragment[0]);
+
+        //
+        // Store the final result
+        //
+
+        destination_iterator.store(output_fragment);
+        ++destination_iterator;
+      }
+
+      if (Base::kFragmentsPerIteration > 1) {
+        shared_load_iterator_.add_pointer_offset(
+            kSmemPointerOffset * (1 - Base::kFragmentsPerIteration));
+      }
+    }
+  }
+
+  template <class Seq>
+  struct acc2smem_source_needed;
+  /// Copies the accumulator fragment for one iteration into shared memory.
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    template <int Advance>
+    CUTLASS_DEVICE static void helper(
+        AccumulatorFragmentIterator accum_fragment_iterator,
+        WarpTileIterator& warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+      // The by-value iterator copy has now been advanced `Advance` times.
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+    CUTLASS_DEVICE
+    static void push(
+        size_t pos,
+        AccumulatorFragmentIterator const& iterator_begin,
+        WarpTileIterator& warp_tile_iterator) {
+      int dummy[] = {
+          (pos == Seq) &&
+          (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+      CUTLASS_UNUSED(dummy[0]); // as in acc2smem_source_not_needed::push
+    }
+  };
+
+  /// Streams the result to global memory, combining with a source tile
+  CUTLASS_DEVICE
+  void compute_source_needed_(
+      OutputOp const& output_op, ///< Output operator
+      OutputTileIterator
+          destination_iterator, ///< Tile iterator for destination
+      AccumulatorTile const&
+          accumulators, ///< Complete warp-level accumulator tile
+      OutputTileSourceIterator
+          source_iterator ///< Tile iterator for the source tensor (loaded one
+                          ///< iteration ahead of its use)
+  ) {
+    typename OutputTileSourceIterator::Fragment source_fragment[2];
+
+    source_fragment[0].clear();
+    source_iterator.load(source_fragment[0]);
+    ++source_iterator;
+    source_fragment[1].clear();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+#pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
+    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
+      if (iter > 0) {
+        __syncthreads();
+      }
+      // Load the source for next iteration (pipelining): the two
+      // source_fragment slots alternate, slot (iter + 1) % 2 is filled here
+      // while slot iter % 2 is consumed by the output operator below.
+
+      if (iter + 1 < OutputTileIterator::kIterations) {
+        source_iterator.load(source_fragment[(iter + 1) % 2]);
+      }
+      ++source_iterator;
+      acc2smem_source_needed<
+          cutlass::make_index_sequence<OutputTileIterator::kIterations>>::
+          push(iter, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment
+          aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the
+      // k-slices
+      if (kPartitionsK > 1) {
+        plus<typename SharedLoadIterator::Fragment> add_fragments;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(
+              aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+
+        shared_load_iterator_.add_pointer_offset(
+            (1 - kPartitionsK) * kSmemPointerOffset);
+      }
+
+      //
+      // Compute the output result
+      //
+
+      typename OutputTileIterator::Fragment output_fragment;
+
+      apply_output_operator_(
+          destination_iterator.thread_start_row(),
+          output_fragment,
+          output_op,
+          aligned_accum_fragment[0],
+          source_fragment[iter % 2]);
+
+      //
+      // Store the final result
+      //
+
+      destination_iterator.store(output_fragment);
+      ++destination_iterator;
+    }
+  }
+
+  /// Invokes the output functor on each kElementsPerAccess-wide output vector
+  CUTLASS_DEVICE
+  void apply_output_operator_(
+      int begin_row,
+      typename OutputTileIterator::Fragment& output_fragment,
+      OutputOp const& output_op, ///< Output operator
+      typename SharedLoadIterator::Fragment const& aligned_accum_fragment,
+      typename OutputTileSourceIterator::Fragment const& source_fragment) {
+    OutputAccessType* output_frag_ptr =
+        reinterpret_cast<OutputAccessType*>(&output_fragment);
+
+    AccumulatorAccessType const* compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
+
+    SourceAccessType const* source_frag_ptr =
+        reinterpret_cast<SourceAccessType const*>(&source_fragment);
+
+    int const kOutputOpIterations = OutputTileIterator::Fragment::kElements /
+        OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+      // Row argument = begin_row + row offset of this vector's first element
+      output_frag_ptr[i] = ApplyEpilogueOp<OutputOp>::apply(
+          output_op,
+          begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess),
+          compute_frag_ptr[i],
+          source_frag_ptr[i]);
+    }
+  }
+
+  /// Same per-vector invocation of the output functor, without a source
+  CUTLASS_DEVICE
+  void apply_output_operator_source_not_needed_(
+      int begin_row,
+      typename OutputTileIterator::Fragment& output_fragment,
+      OutputOp const& output_op, ///< Output operator
+      typename SharedLoadIterator::Fragment const& aligned_accum_fragment) {
+    OutputAccessType* output_frag_ptr =
+        reinterpret_cast<OutputAccessType*>(&output_fragment);
+
+    AccumulatorAccessType const* compute_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const*>(&aligned_accum_fragment);
+
+    int const kOutputOpIterations = OutputTileIterator::Fragment::kElements /
+        OutputTileIterator::kElementsPerAccess;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kOutputOpIterations; ++i) {
+      // Row argument = begin_row + row offset of this vector's first element
+      output_frag_ptr[i] = ApplyEpilogueOp<OutputOp>::apply(
+          output_op,
+          begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess),
+          compute_frag_ptr[i]);
+    }
+  }
+
+  // Row offset of fragment element i, -1 if none (constexpr would need C++14)
+  static int CUTLASS_HOST_DEVICE getRowOffset(int i) {
+    using ThreadMap = typename OutputTileIterator::ThreadMap;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            int frag_idx = ThreadMap::kElementsPerAccess *
+                (frag_row_idx * ThreadMap::Iterations::kColumn + column);
+            if (i < frag_idx + ThreadMap::kElementsPerAccess) {
+              return row_offset;
+            }
+          }
+        }
+      }
+    }
+    return -1;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/epilogue/epilogue_rescale_output.h b/third_party/fused_multi_head_attention/epilogue/epilogue_rescale_output.h
new file mode 100644
index 0000000000..973ec3459b
--- /dev/null
+++ b/third_party/fused_multi_head_attention/epilogue/epilogue_rescale_output.h
@@ -0,0 +1,254 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory
+  to match canonical tensor layouts in global memory. Epilogues support
+  conversion and reduction operations.
+
+  This is a copy of cutlass/epilogue/threadblock/epilogue.h that can
+  handle "row_id" as a first argument, and uses it to get the corresponding
+  `m_prime` / `s_prime` to rescale the output.
+*/
+
+#pragma once
+
+#include <cuda/std/cassert>
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/functional.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "epilogue_pipelined.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies a linear combination operator to an array of elements.
+// output <- alpha * accumulator + beta * source
+//   with:
+//     alpha = 1 / s_prime (to normalize when isLast=True, 1 otherwise)
+//     beta = alpha * m_prime (renormalize the output when the max changes)
+//     source is the current output
+template <
+    typename ElementOutput_, ///< Data type used to store tensors
+    typename ElementSource_, ///< Data type for source (usually matches
+                             ///< `ElementOutput`)
+    int Count, ///< Number of elements computed per operation.
+               ///< Usually it is 128/sizeof_bits<ElementOutput_>,
+               ///< but we use 64 or 32 sometimes when there are not enough data
+               ///< to store
+    typename ElementAccumulator_, ///< Accumulator data type
+    typename ElementCompute_, ///< Data type used to compute linear combination
+    bool isFirst,
+    bool isLast,
+    typename FragmentAlphaBeta_,
+    FloatRoundStyle Round = FloatRoundStyle::round_to_nearest>
+class MemoryEfficientAttentionNormalize {
+ public:
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+
+  static int const kCount = Count;
+
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentSource = Array<ElementSource, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
+  using ComputeFragment = Array<ElementCompute, kCount>;
+  using FragmentAlphaBeta = FragmentAlphaBeta_;
+
+  static FloatRoundStyle const kRound = Round;
+
+ private:
+  //
+  // Data members: row-indexed factors, stored as references (not copied)
+  //
+
+  FragmentAlphaBeta const& s_prime_;
+  FragmentAlphaBeta const& m_prime_;
+
+ public:
+  /// Constructs the function object. s_prime and m_prime are bound by
+  /// reference, so the referenced objects must outlive this functor
+  CUTLASS_HOST_DEVICE
+  MemoryEfficientAttentionNormalize(
+      FragmentAlphaBeta const& s_prime,
+      FragmentAlphaBeta const& m_prime)
+      : s_prime_(s_prime), m_prime_(m_prime) {}
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return !isFirst;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {}
+
+  /// Computes D = alpha * accumulator + beta * source for the given row
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      int row,
+      FragmentAccumulator const& accumulator,
+      FragmentSource const& source) const {
+    assert(!isFirst);
+
+    // Convert source and accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementSource, kCount, Round>
+        source_converter;
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    ComputeFragment converted_source = source_converter(source);
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    // Perform binary operations
+    ComputeFragment intermediate;
+
+    multiplies<ComputeFragment> mul_add_source;
+    multiply_add<ComputeFragment> mul_add_accumulator;
+
+    ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
+    ElementCompute beta = alpha * m_prime_[row];
+
+    intermediate = mul_add_source(beta, converted_source); // X = beta * Source
+
+    intermediate = mul_add_accumulator(
+        alpha, converted_accumulator, intermediate); // D = alpha * Accum + X
+
+    return destination_converter(intermediate);
+  }
+
+  /// Computes D = alpha * accumulator for the given row (no source term)
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(int row, FragmentAccumulator const& accumulator)
+      const {
+    assert(isFirst);
+
+    // Convert accumulator to internal compute numeric type
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
+        accumulator_converter;
+
+    // Convert to destination numeric type
+    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round>
+        destination_converter;
+
+    ComputeFragment converted_accumulator = accumulator_converter(accumulator);
+
+    ComputeFragment intermediate;
+    multiplies<ComputeFragment> mul_accumulator;
+
+    ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
+
+    intermediate = mul_accumulator(
+        alpha, converted_accumulator); // D = alpha * Accum
+
+    return destination_converter(intermediate);
+  }
+};
+
+} // namespace thread
+
+namespace threadblock {
+/// Row-aware adapter for MemoryEfficientAttentionNormalize: unlike the
+/// generic ApplyEpilogueOp, `row_id` is forwarded as the first argument.
+template <
+    typename ElementOutput,
+    typename ElementSource,
+    int Count,
+    typename ElementAccumulator,
+    typename ElementCompute,
+    bool IsFirst,
+    bool IsLast,
+    typename FragmentAlphaBeta,
+    FloatRoundStyle Round>
+struct ApplyEpilogueOp<thread::MemoryEfficientAttentionNormalize<
+    ElementOutput, ElementSource, Count, ElementAccumulator, ElementCompute,
+    IsFirst, IsLast, FragmentAlphaBeta, Round>> {
+  using Op = thread::MemoryEfficientAttentionNormalize<
+      ElementOutput, ElementSource, Count, ElementAccumulator, ElementCompute,
+      IsFirst, IsLast, FragmentAlphaBeta, Round>;
+
+  /// Source-consuming form: calls `output_op(row_id, accum, source)`.
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum,
+      typename Op::FragmentSource const& source) {
+    return output_op(row_id, accum, source);
+  }
+
+  /// Source-free form: calls `output_op(row_id, accum)`.
+  static CUTLASS_DEVICE typename Op::FragmentOutput apply(
+      Op const& output_op,
+      int row_id,
+      typename Op::FragmentAccumulator const& accum) {
+    return output_op(row_id, accum);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h b/third_party/fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h
new file mode 100644
index 0000000000..b110abeced
--- /dev/null
+++ b/third_party/fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h
@@ -0,0 +1,174 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue functor computing output = exp(input - lse).
+*/
+
+#pragma once
+
+#include <cuda_fp16.h>
+
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+/// Element-wise exponential over a fixed-size Array, evaluated with expf().
+template <typename Element, int ElementsPerAccess>
+struct ArrayExponential {
+  /// Returns an array whose idx-th entry is expf(input[idx]).
+  CUTLASS_HOST_DEVICE
+  Array<Element, ElementsPerAccess> operator()(
+      Array<Element, ElementsPerAccess> const& input) const {
+    using Fragment = Array<Element, ElementsPerAccess>;
+    Fragment exponentiated;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < ElementsPerAccess; ++idx) {
+      exponentiated[idx] = expf(input[idx]);
+    }
+
+    return exponentiated;
+  }
+};
+
+/// Specialization for half_t: exponentiates two elements at a time with the
+/// packed h2exp intrinsic, then finishes an odd trailing element (if any)
+/// through float.
+template <int ElementsPerAccess>
+struct ArrayExponential<half_t, ElementsPerAccess> {
+  /// Returns an array whose i-th entry is exp(input[i]).
+  CUTLASS_DEVICE
+  Array<half_t, ElementsPerAccess> operator()(
+      Array<half_t, ElementsPerAccess> const& input) const {
+    Array<half_t, ElementsPerAccess> result;
+
+    // Number of complete __half2 pairs; the division truncates for odd sizes.
+    int const kVectorCount = ElementsPerAccess / 2;
+
+    __half2 const* input_ptr =
+        reinterpret_cast<__half2 const*>(input.raw_data());
+    __half2* res_ptr = reinterpret_cast<__half2*>(result.raw_data());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kVectorCount; ++i) {
+      res_ptr[i] = h2exp(input_ptr[i]);
+    }
+
+    // The paired loop above never touches the last element when the element
+    // count is odd; without this step that entry would be returned unwritten.
+    if (ElementsPerAccess % 2 != 0) {
+      result[ElementsPerAccess - 1] =
+          half_t(expf(float(input[ElementsPerAccess - 1])));
+    }
+
+    return result;
+  }
+};
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Applies:
+/// output <- (input - lse).exp()
+template <
+    typename ElementOutput_, // output
+    typename ElementLSE_, // accumulator from LSE
+    typename ElementAccumulator_, // accumulator from matmul
+    typename ElementCompute_, // intermediate compute (and exp calculation)
+    int ElementsPerAccess>
+class ApplyLogSumExp {
+ public:
+  // Element types, re-exported under the names used by the epilogue.
+  using ElementOutput = ElementOutput_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementLSE = ElementLSE_;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+  static const ScaleType::Kind kScale =
+      cutlass::epilogue::thread::ScaleType::NoBetaScaling;
+
+  // Fragment types: one fixed-size array per element type.
+  using FragmentOutput = Array<ElementOutput, kCount>;
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentLSE = Array<ElementLSE, kElementsPerAccess>;
+  using FragmentScaleBias = FragmentLSE; // Used by epilogue_smem_accumulator.h
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Stateless functor: nothing to initialize.
+  CUTLASS_HOST_DEVICE
+  ApplyLogSumExp() {}
+
+  /// Returns true if source is needed (always, for this functor)
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return true;
+  }
+
+  /// Functionally required for serial reduction in the epilogue; a no-op here
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {}
+
+  /// Computes exp(AB - bias) element-wise in ElementCompute precision and
+  /// converts the result to ElementOutput. `bias` carries the LSE values;
+  /// `scale_unused` is ignored.
+  CUTLASS_HOST_DEVICE
+  FragmentOutput operator()(
+      FragmentAccumulator const& AB,
+      FragmentLSE const& scale_unused,
+      // bias used as LSE
+      FragmentLSE const& bias) const {
+    using AccumToCompute = NumericArrayConverter<
+        ElementCompute,
+        ElementAccumulator,
+        kElementsPerAccess>;
+    using LseToCompute =
+        NumericArrayConverter<ElementCompute, ElementLSE, kElementsPerAccess>;
+    using ComputeToOutput = NumericArrayConverter<
+        ElementOutput,
+        ElementCompute,
+        kElementsPerAccess>;
+    using Exponential =
+        detail::ArrayExponential<ElementCompute, kElementsPerAccess>;
+
+    // Bring both operands into the compute type before subtracting.
+    FragmentCompute const ab_compute = AccumToCompute()(AB);
+    FragmentCompute const lse_compute = LseToCompute()(bias);
+
+    FragmentCompute const shifted =
+        minus<FragmentCompute>()(ab_compute, lse_compute);
+    FragmentCompute const exponentiated = Exponential()(shifted);
+
+    return ComputeToOutput()(exponentiated);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/fmha_backward_test.py b/third_party/fused_multi_head_attention/fmha_backward_test.py
new file mode 100644
index 0000000000..8bc25462ac
--- /dev/null
+++ b/third_party/fused_multi_head_attention/fmha_backward_test.py
@@ -0,0 +1,232 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import argparse
+import torch
+import sys
+import os
+from piped_subprocess import PipedSubprocess, TORCH_DTYPE_NAME
+import math
+
+
+# Command line: the only argument is the path to the kernel executable to test.
+parser = argparse.ArgumentParser()
+parser.add_argument("example_exe", type=str, help="Path to the 41_fused_multi_head_attention_backward executable")
+args = parser.parse_args()
+
+# Fixed seed so every run draws the same random tensors.
+torch.manual_seed(0)
+dtype = torch.float16
+# Problem dimensions: batch, number of queries, number of keys, heads,
+# head dimension of query/key, head dimension of value.
+B, Mq, Mkv, H, K, Kv = 2, 1024, 1024, 5, 128, 128
+causal = True
+# Forwarded to the kernel executable as "repeat_count".
+repeat_count = 100
+
+# Absolute tolerance for the selected dtype, used in the allclose comparisons.
+ATOL = {
+    torch.float: 5e-4,
+    torch.half: 9.5e-2,
+    torch.bfloat16: 7e-1,
+}[dtype]
+
+# Relative tolerance for the selected dtype, used in the allclose comparisons.
+RTOL = {
+    torch.float: 1e-4,
+    torch.half: 2e-2,
+    torch.bfloat16: 1e-1,
+}[dtype]
+
+
+assert not (causal and Mq < Mkv), "causal only supports seqlenK <= seqlenQ"
+
+# Bail out early with a hint when the executable path does not exist.
+fmha_bw_binary = args.example_exe
+if not os.path.isfile(fmha_bw_binary):
+    print(f"""No such file: `{fmha_bw_binary}`\nDid you forget to run "make 41_fused_multi_head_attention"?""")
+    sys.exit(1)
+
+def create_lower_triangular_mask():
+    return torch.triu(torch.full(  # type: ignore
+        [1, Mq, Mkv],
+        dtype=dtype,
+        fill_value=float("-inf"),
+    ), diagonal=1)
+
+def ref_mha_bmk(q, k, v, mask):
+    # Multi-head attention with inputs/outputs in BMK format
+    q = q.float()
+    k = k.float()
+    v = v.float()
+
+    q = q * (1 / q.shape[-1] ** 0.5)
+    attn = q @ k.transpose(-2, -1)
+    if mask is not None:
+        attn += mask
+    attn_max = attn.max(-1, True).values
+    attn_norm = (attn - attn_max).exp().sum(-1, True)
+    attn = attn.softmax(-1)
+    lse = attn_max + attn_norm.log()
+    lse = lse.squeeze(2)
+    return attn @ v, lse
+
+
+def bmhk2bmk(t):
+    return t.permute((0, 2, 1, 3)).reshape(
+        [t.shape[0] * t.shape[2], t.shape[1], t.shape[3]]
+    )
+
+def ref_mha_bmhk(q, k, v, mask):
+    # Multi-head attention with inputs/outputs in BMHK format
+    assert q.ndim == 4
+
+    out, lse = ref_mha_bmk(bmhk2bmk(q), bmhk2bmk(k), bmhk2bmk(v), mask=mask)
+    out = out.reshape([q.shape[0], q.shape[2], q.shape[1], v.shape[3]])
+    return out.permute((0, 2, 1, 3)), lse.reshape([q.shape[0], q.shape[2], q.shape[1]])
+
+def ref_mha_bw_bmhk(q, k, v, mask, lse, out, grad_out, delta):
+    lse = lse[:, :, :q.shape[1]]  #BMH, unpad Q dimension
+    delta = delta.reshape([-1, delta.shape[-1], 1])
+
+    # bmhk -> bmk
+    q, k, v, out, grad_out = [bmhk2bmk(x).float() for x in (q, k, v, out, grad_out)]
+
+    attn_T = k @ q.transpose(-2, -1)
+    if mask is not None:
+        attn_T += mask.transpose(-2, -1)
+    attn_T = attn_T * (1 / q.shape[-1] ** 0.5)
+    attn_T = attn_T - lse.reshape([-1, 1, lse.shape[-1]])
+    attn_T = attn_T.exp()
+
+    grad_v = attn_T @ grad_out
+
+    dov = grad_out @ v.transpose(-2, -1)
+    tmp = (dov - delta) * attn_T.transpose(-2, -1)
+    tmp = tmp / (q.shape[-1] ** 0.5)
+
+    grad_q = tmp @ k
+    grad_k = tmp.transpose(-2, -1) @ q
+
+    return [x.reshape([B, H, x.shape[1], x.shape[-1]]).permute([0, 2, 1, 3]) for x in [grad_q, grad_k, grad_v]]
+
+
+# Draw the random inputs. The order of the randn calls matters because the
+# seed is fixed above.
+print("initializing tensors...")
+query = torch.randn([B, Mq, H, K], dtype=dtype)
+key = 3 * torch.randn([B, Mkv, H, K], dtype=dtype)
+value = 3 * torch.randn([B, Mkv, H, Kv], dtype=dtype)
+mask = create_lower_triangular_mask() if causal else None
+
+# let PyTorch compute gradients
+query.requires_grad_(True)
+key.requires_grad_(True)
+value.requires_grad_(True)
+
+print("computing fw...")
+out, lse = ref_mha_bmhk(query, key, value, mask=mask)
+# Cast the float32 reference output back to the test dtype.
+out = out.to(dtype).contiguous()
+grad_out = 3 * torch.randn([B, Mq, H, Kv], dtype=dtype)
+
+print("computing bw with autograd...")
+out.backward(grad_out)
+# Softmax scale sent to the kernel: 1 / sqrt(head_dim).
+scale = (1 / query.shape[-1] ** 0.5)
+
+
+# Additional data needed by the kernel
+# delta: sum over the last dimension of grad_out * out, with the last two
+# remaining axes swapped.
+delta = (grad_out.float() * out.float()).sum(-1).transpose(-2, -1).contiguous()
+# Pad the last dimension of lse up to a multiple of 32, filling with +inf.
+pad_amount = (32 - (lse.shape[2] % 32)) % 32
+lse = torch.nn.functional.pad(lse, [0, pad_amount], value=math.inf)
+
+print("computing bw with reference implem...")
+gQr, gKr, gVr = ref_mha_bw_bmhk(query, key, value, mask, lse, out, grad_out, delta)
+
+with PipedSubprocess(fmha_bw_binary) as bw_kernel:
+    # Send kernel arguments
+    bw_kernel.write(
+        TORCH_DTYPE_NAME[query.dtype],
+        "scale", scale,
+        "head_dim", K,
+        "head_dim_value", Kv,
+        "num_queries", Mq,
+        "num_keys", Mkv,
+        "num_heads", H,
+        "custom_mask_type", (1 if causal else 0),
+        "num_batches", B,
+        "repeat_count", repeat_count,
+        "num_splits_key", (Mkv // 128),
+    )
+    bw_kernel.writeTensor(query, "query", ["q_strideB", "q_strideM", "q_strideH"])
+    bw_kernel.writeTensor(key, "key", ["k_strideB", "k_strideM", "k_strideH"])
+    bw_kernel.writeTensor(value, "value", ["v_strideB", "v_strideM", "v_strideH"])
+    bw_kernel.writeTensor(lse, "logsumexp", ["lse_strideB", "lse_strideH"])
+    bw_kernel.writeTensor(out, "output", ["o_strideB", "o_strideM", "o_strideH"])
+    bw_kernel.writeTensor(grad_out, "grad_output", ["gO_strideB", "gO_strideM", "gO_strideH"])
+    bw_kernel.writeTensor(delta, "delta", ["delta_strideB", "delta_strideH"])
+
+    if bw_kernel.read() != "OK":
+        print("Got unexpected output")
+        print(bw_kernel.subp.communicate()[0])
+        sys.exit(0)
+
+    # Read kernel output
+    gQ = bw_kernel.readTensor("grad_query", ["gQ_strideB", "gQ_strideM", "gQ_strideH"], query.shape).float()
+    gK = bw_kernel.readTensor("grad_key", ["gK_strideB", "gK_strideM", "gK_strideH"], key.shape).float()
+    gV = bw_kernel.readTensor("grad_value", ["gV_strideB", "gV_strideM", "gV_strideH"], value.shape).float()
+    runtime_ms = float(bw_kernel.readNamed("runtime_ms"))
+
+float_ops = B * H * sum([
+    # att = Q @ K.transpose
+    Mq * Mkv * K * 2,
+    # att @ dO
+    Mkv * Mq * Kv * 2,
+    # dov = dO @ V
+    Mq * Kv * Mkv * 2,
+    # dov @ K
+    Mq * K * Mkv * 2,
+    # dov @ Q
+    Mq * K * Mkv * 2,
+])
+if causal:
+    float_ops //= 2
+
+print(f"""
+Fused multi-head attention - backward
+    batch_size={B}
+    num_queries={Mq}
+    num_keys={Mkv}
+    num_heads={H}
+    head_dim={K}
+    head_dim_value={Kv}
+
+    Correctness:
+        grad_query: {"PASS" if torch.allclose(gQ, gQr, rtol=RTOL, atol=ATOL) else "FAIL"} (delta: {(gQ - gQr).abs().max()})
+        grad_key:   {"PASS" if torch.allclose(gK, gKr, rtol=RTOL, atol=ATOL) else "FAIL"} (delta: {(gK - gKr).abs().max()})
+        grad_value: {"PASS" if torch.allclose(gV, gVr, rtol=RTOL, atol=ATOL) else "FAIL"} (delta: {(gV - gVr).abs().max()})
+        (atol={ATOL} / rtol={RTOL})
+    Runtime: {runtime_ms}ms ({(float_ops / (1024 ** 4)) / (runtime_ms / 1000):.4f} TFlops)
+""")
+
+assert torch.allclose(query.grad.float(), gQr, rtol=RTOL, atol=ATOL), "Reference implementation does not match PyTorch autograd!"
+assert torch.allclose(key.grad.float(), gKr, rtol=RTOL, atol=ATOL), "Reference implementation does not match PyTorch autograd!"
+assert torch.allclose(value.grad.float(), gVr, rtol=RTOL, atol=ATOL), "Reference implementation does not match PyTorch autograd!"
diff --git a/third_party/fused_multi_head_attention/fmha_grouped.h b/third_party/fused_multi_head_attention/fmha_grouped.h
new file mode 100644
index 0000000000..afc25e4340
--- /dev/null
+++ b/third_party/fused_multi_head_attention/fmha_grouped.h
@@ -0,0 +1,1023 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Grouped FMHA kernel
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/trace.h"
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+
+#include "fmha_grouped_problem_visitor.h"
+#include "gemm_kernel_utils.h"
+#include "gemm/mma_accum_lambda_iterator.h"
+#include "epilogue/epilogue_rescale_output.h"
+
+
+namespace {
+/// Atomic maximum on a float, built from the integer atomics by
+/// reinterpreting the bits: atomicMax on the signed view for non-negative
+/// values, atomicMin on the unsigned view otherwise. Returns the result of
+/// the underlying atomic, reinterpreted as a float.
+static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) {
+  // source: https://stackoverflow.com/a/51549250
+  if (value >= 0) {
+    int const previous_bits = atomicMax((int*)addr, __float_as_int(value));
+    return __int_as_float(previous_bits);
+  }
+  unsigned int const previous_bits =
+      atomicMin((unsigned int*)addr, __float_as_uint(value));
+  return __uint_as_float(previous_bits);
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename MM0_,                           ///! Structure for computing P = Q @ K
+  typename MM1_,                           ///! Structure for computing O = P @ V
+  typename scalar_t_,
+  typename accum_t_,
+  typename output_t_,
+  typename output_accum_t_,
+  bool kKeepOutputInRF,                    ///! Whether the intermediate output from MM0_ should be kept in the register file
+  GroupScheduleMode GroupScheduleMode_     ///! Type of scheduling to perform
+>
+struct FMHAGrouped {
+public:
+  using MM0 = MM0_;
+  using MM1 = MM1_;
+
+  using scalar_t = scalar_t_;
+  using accum_t = accum_t_;
+  using output_t = output_t_;
+  using output_accum_t = output_accum_t_;
+
+  static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_;
+
+  static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF &&
+      !cutlass::platform::is_same<output_accum_t, output_t>::value;
+
+  // Parameters to satisfy BaseGrouped
+  using ElementA = scalar_t;
+  using ElementB = scalar_t;
+  using ElementC = accum_t;
+  using LayoutA = typename MM0::LayoutA;
+  // Layout aliases must name layout types (they previously aliased the
+  // element types MM0::ElementB / MM1::ElementC).
+  using LayoutB = typename MM0::LayoutB;
+  using LayoutC = typename MM1::LayoutC;
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+  static int const kAlignmentA = MM0::kAlignmentA;
+  static int const kAlignmentB = MM0::kAlignmentB;
+  static int const kAlignmentC = 1;
+  using Mma = typename MM1::Mma;
+  using EpilogueOutputOp = typename MM1::EpilogueOutputOp;
+  using ThreadblockSwizzle = void;
+  using Operator = typename MM1::Operator;
+  using WarpShape = typename MM1::WarpShape;
+  using InstructionShape = typename MM1::InstructionShape;
+
+  // Element types of the attention operands, named by role.
+  using ElementQ = scalar_t;
+  using ElementK = scalar_t;
+  using ElementP = accum_t;
+  using ElementV = scalar_t;
+  using ElementO = output_t;
+  using ElementOAccum = output_accum_t;
+  using ElementAccumulator = accum_t;
+
+  // Layouts of the attention operands, taken from the two matmul definitions.
+  using LayoutQ = typename MM0::LayoutA;
+  using LayoutK = typename MM0::LayoutB;
+  using LayoutP = typename MM0::LayoutC;
+  using LayoutV = typename MM1::LayoutB;
+  using LayoutO = typename MM1::LayoutC;
+
+  // True only when MM1 targets compute capability >= 80 and V elements are
+  // 16 bits wide.
+  static bool const kPreloadV = (MM1::Mma::ArchTag::kMinComputeCapability >= 80 &&
+                                 cutlass::sizeof_bits<ElementV>::value == 16);
+
+  static int const kAlignmentQ = MM0::kAlignmentA;
+  static int const kAlignmentK = MM0::kAlignmentB;
+  static int const kAlignmentV = 1;
+
+  using ThreadblockShape = typename MM0::ThreadblockShape;
+
+  // Tile extents of MM0: rows are queries, columns are keys.
+  static int const kQueriesPerBlock = ThreadblockShape::kM;
+  static int const kKeysPerBlock = ThreadblockShape::kN;
+
+  // Features this grouped kernel does not provide.
+  static constexpr bool kSupportsDropout = false;
+  static constexpr bool kSupportsBias = false;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename MM1::WarpCount;
+  static int const kThreadsPerWarp = 32;
+  static int const kThreadCount = kThreadsPerWarp * WarpCount::kCount;
+
+  // Tile area (queries x keys) divided by kThreadsPerWarp squared.
+  static constexpr int kNumWarpsPerBlock =
+    kQueriesPerBlock * kKeysPerBlock / (kThreadsPerWarp * kThreadsPerWarp);
+
+  // Visitor that hands out tiles; parameterized on the MM0 tile shape, the
+  // schedule mode and the thread count.
+  using ProblemVisitor = FMHAGroupedProblemVisitor<
+                            ThreadblockShape,
+                            kGroupScheduleMode,
+                            kThreadCount,
+                            kThreadCount>;
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  ///
+  /// Holds the per-problem size arrays, the per-problem operand pointer
+  /// arrays and leading-dimension arrays, plus the scalar options (causal
+  /// masking flag and scale).
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmCoord *problem_sizes0{nullptr};
+    GemmCoord *problem_sizes1{nullptr};
+
+    int problem_count{0};
+    int threadblock_count{0};
+
+    ElementQ ** ptr_Q{nullptr};
+    ElementK ** ptr_K{nullptr};
+    ElementP ** ptr_P{nullptr};
+    ElementV ** ptr_V{nullptr};
+    ElementO ** ptr_O{nullptr};
+    ElementOAccum ** ptr_O_accum{nullptr};
+
+    typename LayoutQ::Stride::LongIndex *ldq{nullptr};
+    typename LayoutK::Stride::LongIndex *ldk{nullptr};
+    // Declared with V's layout so it matches the constructor parameter it is
+    // initialized from (it was previously declared with LayoutP).
+    typename LayoutV::Stride::LongIndex *ldv{nullptr};
+    typename LayoutO::Stride::LongIndex *ldo{nullptr};
+
+    // Whether causal masking is to be performed
+    bool causal{false};
+
+    // Scale
+    ElementAccumulator scale{0};
+
+    // Only used by device-level operator
+    GemmCoord *host_problem_sizes{nullptr};
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    Arguments() = default;
+
+    /// Ctor
+    ///
+    /// `ldp` is accepted for interface compatibility but is not stored.
+    /// When no separate output-accumulator buffer is needed, `ptr_O_accum`
+    /// is ignored and the accumulator pointer aliases `ptr_O`.
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      GemmCoord *problem_sizes0,
+      GemmCoord *problem_sizes1,
+      int problem_count,
+      int threadblock_count,
+      ElementQ ** ptr_Q,
+      ElementK ** ptr_K,
+      ElementP ** ptr_P,
+      ElementV ** ptr_V,
+      ElementO ** ptr_O,
+      ElementOAccum ** ptr_O_accum,
+      typename LayoutQ::Stride::LongIndex *ldq,
+      typename LayoutK::Stride::LongIndex *ldk,
+      typename LayoutP::Stride::LongIndex *ldp,
+      typename LayoutV::Stride::LongIndex *ldv,
+      typename LayoutO::Stride::LongIndex *ldo,
+      bool causal,
+      ElementAccumulator scale,
+      GemmCoord *host_problem_sizes=nullptr
+    ):
+      problem_sizes0(problem_sizes0),
+      problem_sizes1(problem_sizes1),
+      problem_count(problem_count),
+      threadblock_count(threadblock_count),
+      ptr_Q(ptr_Q),
+      ptr_K(ptr_K),
+      ptr_P(ptr_P),
+      ptr_V(ptr_V),
+      ptr_O(ptr_O),
+      // Cast to the member's own type so both branches of the conditional
+      // agree even when accum_t and output_accum_t differ.
+      ptr_O_accum(kNeedsOutputAccumulatorBuffer ? ptr_O_accum : (ElementOAccum**)ptr_O),
+      ldq(ldq),
+      ldk(ldk),
+      ldv(ldv),
+      ldo(ldo),
+      causal(causal),
+      scale(scale),
+      host_problem_sizes(host_problem_sizes)
+    {
+
+    }
+
+    /// Host-side alignment checks for Q, K and V.
+    // NOTE(review): ptr_* and ld* are per-problem arrays (pointers) in this
+    // struct, so these checks look like they were written for a single
+    // pointer / scalar stride - confirm before relying on this method.
+    bool __host__ check_supported() {
+      CHECK_ALIGNED_PTR(ptr_Q, kAlignmentQ);
+      CHECK_ALIGNED_PTR(ptr_K, kAlignmentK);
+      CHECK_ALIGNED_PTR(ptr_V, kAlignmentV);
+      XFORMERS_CHECK(ldq % kAlignmentQ == 0, "query is not correctly aligned");
+      XFORMERS_CHECK(ldk % kAlignmentK == 0, "key is not correctly aligned");
+      XFORMERS_CHECK(ldv % kAlignmentV == 0, "value is not correctly aligned");
+      return true;
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  ///
+  /// Kernel-side copy of Arguments, plus the problem visitor parameters built
+  /// from the problem-size arrays, workspace and tile count.
+  struct Params {
+
+    typename ProblemVisitor::Params problem_visitor;
+    int threadblock_count;
+
+    ElementQ ** ptr_Q;
+    ElementK ** ptr_K;
+    ElementP ** ptr_P;
+    ElementV ** ptr_V;
+    ElementO ** ptr_O;
+    ElementOAccum ** ptr_O_accum;
+
+    typename LayoutQ::Stride::LongIndex *ldq;
+    typename LayoutK::Stride::LongIndex *ldk;
+    // Declared with V's layout (it was previously declared with LayoutP).
+    typename LayoutV::Stride::LongIndex *ldv;
+    typename LayoutO::Stride::LongIndex *ldo;
+
+    ElementAccumulator scale;
+    bool causal;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor: null pointers, zero threadblocks, no causal masking,
+    /// zero scale. Initializers are listed in member declaration order.
+    CUTLASS_HOST_DEVICE
+    Params():
+      threadblock_count(0),
+      ptr_Q(nullptr),
+      ptr_K(nullptr),
+      ptr_P(nullptr),
+      ptr_V(nullptr),
+      ptr_O(nullptr),
+      ptr_O_accum(nullptr),
+      ldq(nullptr),
+      ldk(nullptr),
+      ldv(nullptr),
+      ldo(nullptr),
+      scale(0),
+      causal(false)
+    { }
+
+    /// Builds the parameters from `args`. When no separate output-accumulator
+    /// buffer is needed, the accumulator pointer aliases `args.ptr_O`.
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const &args,
+          void *workspace = nullptr,
+          int tile_count = 0):
+      problem_visitor(args.problem_sizes0, args.problem_sizes1, args.problem_count, workspace, tile_count),
+      threadblock_count(args.threadblock_count),
+      ptr_Q(args.ptr_Q),
+      ptr_K(args.ptr_K),
+      ptr_P(args.ptr_P),
+      ptr_V(args.ptr_V),
+      ptr_O(args.ptr_O),
+      // Cast to the member's own type so both branches of the conditional
+      // agree even when accum_t and output_accum_t differ.
+      ptr_O_accum(kNeedsOutputAccumulatorBuffer ? args.ptr_O_accum : (ElementOAccum**)args.ptr_O),
+      ldq(args.ldq),
+      ldk(args.ldk),
+      ldv(args.ldv),
+      ldo(args.ldo),
+      scale(args.scale),
+      causal(args.causal)
+    {
+
+    }
+
+    /// Re-initializes every field from `args`, mirroring the converting ctor.
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr,
+      int tile_count = 0) {
+
+      problem_visitor = typename ProblemVisitor::Params(args.problem_sizes0,
+                                                        args.problem_sizes1,
+                                                        args.problem_count,
+                                                        workspace, tile_count);
+      threadblock_count = args.threadblock_count;
+      ptr_Q = args.ptr_Q;
+      ptr_K = args.ptr_K;
+      ptr_P = args.ptr_P;
+      ptr_V = args.ptr_V;
+      ptr_O = args.ptr_O;
+      ptr_O_accum = kNeedsOutputAccumulatorBuffer ? args.ptr_O_accum : (ElementOAccum**)args.ptr_O;
+      ldq = args.ldq;
+      ldk = args.ldk;
+      ldv = args.ldv;
+      ldo = args.ldo;
+      causal = args.causal;
+      scale = args.scale;
+    }
+  };
+
+  // Shared storage - depends on kernel params
+  // Scaling state common to both storage variants: arrays with one entry per
+  // query row of the tile, plus `addition_storage`, which holds
+  // kQueriesPerBlock * MM0::MmaCore::WarpCount::kN entries.
+  struct ScalingCoefs {
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock> m_prime;
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock> s_prime;
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock> mi;
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock> out_rescale;
+    cutlass::Array<ElementAccumulator, kQueriesPerBlock * MM0::MmaCore::WarpCount::kN>
+        addition_storage;
+  };
+
+  // Storage variant in which the epilogue buffer is a direct member of the
+  // union, overlapping both the MM0 storage and the post-MM0 storage.
+  struct SharedStorageEpilogueAtEnd : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      typename MM0::AccumulatorSharedStorage si;
+      typename MM1::Mma::SharedStorage mm1;
+    };
+
+    // The three members share the same bytes.
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    // Uniform accessor so callers need not know which variant is in use.
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return epilogue;
+    }
+
+    // ProblemVisitor shared storage can't be overlapped with others
+    typename ProblemVisitor::SharedStorage problem_visitor;
+  };
+
+  // Storage variant in which the epilogue buffer sits inside the post-MM0
+  // struct, next to `si` and `mm1` rather than overlapping them.
+  struct SharedStorageEpilogueInLoop : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      typename MM0::AccumulatorSharedStorage si;
+      typename MM1::Mma::SharedStorage mm1;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+
+    // The two members share the same bytes.
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+    };
+
+    // Uniform accessor so callers need not know which variant is in use.
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return after_mm0.epilogue;
+    }
+
+    // ProblemVisitor shared storage can't be overlapped with others
+    typename ProblemVisitor::SharedStorage problem_visitor;
+  };
+
+  // kKeepOutputInRF selects the "epilogue at end" variant; otherwise the
+  // "epilogue in loop" variant is used.
+  using SharedStorage = typename cutlass::platform::conditional<
+      kKeepOutputInRF,
+      SharedStorageEpilogueAtEnd,
+      SharedStorageEpilogueInLoop>::type;
+
+private:
+
+  // Per-tile quantities derived from the threadblock index and problem size
+  struct TileParams {
+
+    /// Index of the first query row covered by tile `threadblock_idx`.
+    CUTLASS_HOST_DEVICE
+    static int query_start(int threadblock_idx) {
+      return kQueriesPerBlock * threadblock_idx;
+    }
+
+    /// True when the tile's first query row lies inside the problem, whose
+    /// query count is the M dimension of problem 0.
+    CUTLASS_HOST_DEVICE
+    static bool can_compute(int threadblock_idx, const GemmCoord& problem_size0) {
+      int const first_query = query_start(threadblock_idx);
+      return problem_size0.m() > first_query;
+    }
+
+    /// Number of query rows from the tile's first row to the end of the
+    /// problem.
+    CUTLASS_HOST_DEVICE
+    static int num_queries(int threadblock_idx, const GemmCoord& problem_size0) {
+      int const first_query = query_start(threadblock_idx);
+      return problem_size0.m() - first_query;
+    }
+
+    /// Number of keys the tile iterates over: the N dimension of problem 0,
+    /// capped at the end of the tile's query range when `causal` is set.
+    CUTLASS_HOST_DEVICE
+    static int num_keys(int threadblock_idx, const GemmCoord& problem_size0, bool causal) {
+      int const total_keys = problem_size0.n();
+      if (!causal) {
+        return total_keys;
+      }
+      int32_t const causal_limit =
+          int32_t(query_start(threadblock_idx) + kQueriesPerBlock);
+      return cutlass::fast_min(causal_limit, total_keys);
+    }
+
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Default ctor; the kernel object carries no state of its own.
+  CUTLASS_DEVICE
+  FMHAGrouped() { }
+
+  /// Always reports success; no check is performed on the problem size.
+  static Status can_implement(cutlass::gemm::GemmCoord const & problem_size) {
+    return Status::kSuccess;
+  }
+
+  /// Always reports success; no check is performed on the arguments.
+  static Status can_implement(Arguments const &args) {
+    return Status::kSuccess;
+  }
+
+  /// threadIdx.x narrowed to 16 bits.
+  static CUTLASS_DEVICE int16_t thread_id() {
+    return static_cast<int16_t>(threadIdx.x);
+  }
+
+  /// threadIdx.x divided by the warp size, narrowed to 8 bits.
+  static CUTLASS_DEVICE int8_t warp_id() {
+    return static_cast<int8_t>(threadIdx.x / kThreadsPerWarp);
+  }
+
+  /// threadIdx.x modulo the warp size, narrowed to 8 bits.
+  static CUTLASS_DEVICE int8_t lane_id() {
+    return static_cast<int8_t>(threadIdx.x % kThreadsPerWarp);
+  }
+
+  /// Persistent tile loop: for each tile, computes Q.K_t, the iterative softmax, then Attn.V
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    auto& m_prime = shared_storage.m_prime;
+    auto& s_prime = shared_storage.s_prime;
+    [[maybe_unused]] auto& si = shared_storage.after_mm0.si;
+    auto& mi = shared_storage.mi;
+    auto& out_rescale = shared_storage.out_rescale;
+
+    ProblemVisitor problem_visitor(
+      params.problem_visitor,
+      shared_storage.problem_visitor,
+      blockIdx.x);
+
+    // Outer 'persistent' loop to iterate over tiles
+    while (problem_visitor.next_tile()) {
+
+      GemmCoord problem_size0 = problem_visitor.problem_size0();
+      GemmCoord problem_size1 = problem_visitor.problem_size1();
+      const int32_t threadblock_idx = int32_t(problem_visitor.threadblock_idx());
+
+      if (!TileParams::can_compute(threadblock_idx, problem_size0)) { // tile is not computable: skip to the next one
+        problem_visitor.advance(gridDim.x);
+        continue;
+      }
+
+      const int32_t problem_idx = problem_visitor.problem_index();
+
+      if (thread_id() < kQueriesPerBlock) { // one thread per query row resets the running softmax state
+        s_prime[thread_id()] = ElementAccumulator(0);
+        out_rescale[thread_id()] = accum_t(1.0);
+        m_prime[thread_id()] =
+            -cutlass::platform::numeric_limits<ElementAccumulator>::infinity();
+        mi[thread_id()] = -cutlass::platform::numeric_limits<ElementAccumulator>::infinity();
+      }
+
+      ElementO *ptr_O = params.ptr_O[problem_idx]  + TileParams::query_start(threadblock_idx) * params.ldo[problem_idx];
+      ElementOAccum *ptr_O_accum = params.ptr_O_accum[problem_idx]  + TileParams::query_start(threadblock_idx) * params.ldo[problem_idx];
+      const int num_queries = TileParams::num_queries(threadblock_idx, problem_size0);
+
+      auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator { // iterator over ptr_O at column offset `col`
+        using OutputTileIterator = typename MM1::OutputTileIterator;
+        return OutputTileIterator(
+            typename OutputTileIterator::Params{(int32_t)params.ldo[problem_idx]},
+            ptr_O,
+            typename OutputTileIterator::TensorCoord{
+                num_queries, problem_size1.n()},
+            thread_id(),
+            {0, col});
+      };
+
+      auto createOutputAccumIter = [&](int col) ->
+        typename MM1::OutputTileIteratorAccum { // same as above, but over ptr_O_accum
+          using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum;
+          return OutputTileIteratorAccum(
+              typename OutputTileIteratorAccum::Params{(int32_t)params.ldo[problem_idx]},
+              ptr_O_accum,
+              typename OutputTileIteratorAccum::TensorCoord{
+                  num_queries, problem_size1.n()},
+              thread_id(),
+              {0, col});
+        };
+
+      typename MM1::Mma::FragmentC accum_o;
+      accum_o.clear();
+
+      const int num_keys = TileParams::num_keys(threadblock_idx, problem_size0, params.causal);
+
+      for (int32_t iter_key_start = 0; iter_key_start < num_keys;
+           iter_key_start += kKeysPerBlock) { // each iteration handles one block of up to kKeysPerBlock keys
+        int32_t problem_size_0_m =
+            cutlass::fast_min((int32_t)kQueriesPerBlock, num_queries);
+        int32_t problem_size_0_n = cutlass::fast_min(
+            (int32_t)kKeysPerBlock, num_keys - iter_key_start);
+        int32_t const& problem_size_0_k = problem_size0.k();
+        int32_t const& problem_size_1_n = problem_size1.n();
+        int32_t const& problem_size_1_k = problem_size_0_n;
+
+        auto prologueV = [&](int blockN) { // runs the MM1 prologue for column block `blockN` of V
+          typename MM1::Mma::IteratorB iterator_V(
+              typename MM1::IteratorB::Params{typename MM1::LayoutB(params.ldv[problem_idx])},
+              params.ptr_V[problem_idx] + iter_key_start * params.ldv[problem_idx],
+              {problem_size_1_k, problem_size_1_n},
+              thread_id(),
+              cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+
+          MM1::Mma::prologue(
+              shared_storage.after_mm0.mm1,
+              iterator_V,
+              thread_id(),
+              problem_size_1_k);
+        };
+
+        __syncthreads(); // Need to have shared memory initialized, and `m_prime`
+                         // updated from end of prev iter
+
+        //
+        // MATMUL: Q.K_t
+        //
+        // Computes the block-matrix product of:
+        // (a) query[query_start:query_end, :]
+        // with
+        // (b) key[iter_key_start:iter_key_start + kKeysPerBlock]
+        // into `accum`; after the softmax it is stored to `shared_storage.after_mm0.si`
+        //
+
+        ElementQ *ptr_Q = params.ptr_Q[problem_idx] + TileParams::query_start(threadblock_idx) * params.ldq[problem_idx];
+
+        // Construct iterators to A and B operands
+        typename MM0::IteratorA iterator_A(
+          typename MM0::IteratorA::Params(
+              typename MM0::MmaCore::LayoutA(params.ldq[problem_idx])),
+          ptr_Q,
+          {problem_size_0_m, problem_size_0_k},
+          thread_id(),
+          {0, 0});
+
+        typename MM0::IteratorB iterator_B(
+            typename MM0::IteratorB::Params(
+                typename MM0::MmaCore::LayoutB(params.ldk[problem_idx])),
+            params.ptr_K[problem_idx] + iter_key_start * params.ldk[problem_idx],
+            {problem_size_0_k, problem_size_0_n},
+            thread_id(),
+            {0, 0});
+
+        // Construct thread-scoped matrix multiply
+        typename MM0::Mma mma(
+            shared_storage.mm0, thread_id(), warp_id(), lane_id());
+
+        typename MM0::Mma::FragmentC accum;
+
+        accum.clear();
+
+        auto gemm_k_iterations =
+            (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK;
+
+        // Compute threadblock-scoped matrix multiply-add
+        mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+        __syncthreads();
+
+        if (kPreloadV) {
+          prologueV(0);
+        } else {
+          MM1::Mma::drain_cp_asyncs();
+        }
+
+        typename MM0::Mma::Operator::IteratorC::TensorCoord
+          iteratorC_tile_offset = {
+              (warp_id() % MM0::Mma::WarpCount::kM),
+              (warp_id() / MM0::Mma::WarpCount::kM)
+            };
+
+        // Causal, last key block only: entries with column > (query row - iter_key_start) become -inf
+        if (params.causal && num_keys - iter_key_start <= kKeysPerBlock) {
+          auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset(
+              lane_id(), warp_id(), iteratorC_tile_offset);
+          int32_t last_col;
+          MM0::AccumLambdaIterator::iterateRows(
+              lane_offset,
+              [&](int accum_m) {
+                last_col = TileParams::query_start(threadblock_idx) + accum_m - iter_key_start;
+              },
+              [&](int accum_m, int accum_n, int idx) {
+                if (accum_n > last_col) {
+                  accum[idx] =
+                      -cutlass::platform::numeric_limits<accum_t>::infinity();
+                }
+              },
+              [&](int accum_m) {});
+        }
+        // DISPATCH_BOOL(iter_key_start == 0, kIsFirst, ([&] {
+        //         DISPATCH_BOOL(
+        //             num_keys - iter_key_start >= kKeysPerBlock,
+        //             kFullColumns,
+        //             ([&] {
+        //               // Update `mi` from accum stored in registers
+        //               // Also does accum[i] <- exp(accum[i] - mi)
+        //               iterative_softmax<
+        //                   typename MM0::Mma::Operator::IteratorC,
+        //                   kFullColumns,
+        //                   kIsFirst>(
+        //                   accum_o,
+        //                   accum,
+        //                   mi,
+        //                   m_prime,
+        //                   s_prime,
+        //                   lane_id(),
+        //                   thread_id(),
+        //                   warp_id(),
+        //                   num_keys - iter_key_start,
+        //                   iteratorC_tile_offset,
+        //                   kSupportsBias ? 1.0f : params.scale);
+        //             }));
+        //       }));
+
+        // Update `mi` from accum stored in registers
+        // Also does accum[i] <- exp(accum[i] - mi)
+        iterative_softmax<typename MM0::Mma::Operator::IteratorC>(
+            accum_o,
+            accum,
+            mi,
+            m_prime,
+            s_prime,
+            out_rescale,
+            shared_storage.addition_storage,
+            lane_id(),
+            thread_id(),
+            warp_id(),
+            num_keys - iter_key_start,
+            iter_key_start == 0,
+            iteratorC_tile_offset,
+            kSupportsBias ? 1.0f : params.scale);
+
+        // Output results to shared-memory
+        int warp_idx_mn_0 = warp_id() %
+            (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN);
+        auto output_tile_coords = cutlass::MatrixCoord{
+            warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM,
+            warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM};
+
+        MM0::B2bGemm::accumToSmem(
+            shared_storage.after_mm0.si, accum, lane_id(), output_tile_coords);
+
+        __syncthreads();
+
+        //
+        // MATMUL: Attn . V
+        // Run the matmul `attn @ V` for a block of attn and V.
+        // `attn` is read from shared memory (in `shared_storage.after_mm0.si`)
+        // `V` is read from global memory (with iterator_V)
+        //
+
+        const int64_t nBlockN = kKeepOutputInRF ? 1
+                                                : ceil_div(
+                                                      (int64_t)problem_size_1_n,
+                                                      int64_t(MM1::ThreadblockShape::kN));
+
+        // Iterate over the N dimension of GEMM1
+        for (int blockN = 0; blockN < nBlockN; ++blockN) {
+          int gemm_k_iterations =
+              (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK;
+
+          // Compute threadblock-scoped matrix multiply-add and store it in accum
+          // (in registers)
+          if (!kPreloadV) {
+            __syncthreads(); // we share shmem between mma and epilogue
+          }
+
+          typename MM1::Mma::IteratorB iterator_V(
+            typename MM1::IteratorB::Params{typename MM1::LayoutB(params.ldv[problem_idx])},
+            params.ptr_V[problem_idx] + iter_key_start * params.ldv[problem_idx],
+            {problem_size_1_k, problem_size_1_n},
+            thread_id(),
+            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+
+          typename MM1::Mma mma_pv(
+            // operand A: Pij_dropped in shared memory
+            shared_storage.after_mm0.si.accum_ref(),
+            // operand B: shared memory staging area for Vj, which is loaded
+            // from global memory
+            shared_storage.after_mm0.mm1.operand_B_ref(),
+            (int)thread_id(),
+            (int)warp_id(),
+            (int)lane_id());
+
+          mma_pv.set_prologue_done(kPreloadV);
+          if (!kKeepOutputInRF) {
+            accum_o.clear();
+          }
+
+          mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o);
+          __syncthreads();
+
+          if (kPreloadV && !kKeepOutputInRF && blockN + 1 < nBlockN) {
+            prologueV(blockN + 1);
+          }
+
+          if (!kKeepOutputInRF) {
+            MM1::Mma::drain_cp_asyncs();
+            DISPATCH_BOOL(
+                iter_key_start == 0, kIsFirst, ([&] {
+                  DISPATCH_BOOL(
+                      (iter_key_start + kKeysPerBlock) >= num_keys,
+                      kIsLast,
+                      ([&] {
+                        using DefaultEpilogue = typename MM1::DefaultEpilogue;
+                        using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp;
+                        using ElementCompute = typename DefaultOp::ElementCompute;
+                        using EpilogueOutputOp = typename cutlass::epilogue::
+                            thread::MemoryEfficientAttentionNormalize<
+                                typename cutlass::platform::conditional<
+                                    kIsLast::value,
+                                    output_t,
+                                    output_accum_t>::type,
+                                output_accum_t,
+                                DefaultOp::kCount,
+                                typename DefaultOp::ElementAccumulator,
+                                output_accum_t,
+                                kIsFirst::value,
+                                kIsLast::value,
+                                cutlass::Array<ElementCompute, kQueriesPerBlock>>;
+                        using Epilogue = typename cutlass::epilogue::threadblock::
+                            EpiloguePipelined<
+                                typename DefaultEpilogue::Shape,
+                                typename MM1::Mma::Operator,
+                                DefaultEpilogue::kPartitionsK,
+                                typename cutlass::platform::conditional<
+                                    kIsLast::value,
+                                    typename MM1::OutputTileIterator,
+                                    typename MM1::OutputTileIteratorAccum>::type,
+                                typename DefaultEpilogue::
+                                    AccumulatorFragmentIterator,
+                                typename DefaultEpilogue::WarpTileIterator,
+                                typename DefaultEpilogue::SharedLoadIterator,
+                                EpilogueOutputOp,
+                                typename DefaultEpilogue::Padding,
+                                DefaultEpilogue::kFragmentsPerIteration,
+                                true, // IterationsUnroll
+                                typename MM1::OutputTileIteratorAccum // Read
+                                                                      // iterator
+                                >;
+
+                        int col = blockN * MM1::Mma::Shape::kN;
+                        auto source_iter = createOutputAccumIter(col);
+                        auto dest_iter = gemm_kernel_utils::call_conditional<
+                            kIsLast::value,
+                            decltype(createOutputIter),
+                            decltype(createOutputAccumIter)>::
+                            apply(createOutputIter, createOutputAccumIter, col);
+                        EpilogueOutputOp rescale(s_prime, out_rescale);
+                        Epilogue epilogue(
+                            shared_storage.epilogue_shared_storage(),
+                            thread_id(),
+                            warp_id(),
+                            lane_id());
+                        epilogue(rescale, dest_iter, accum_o, source_iter);
+                      }));
+                }));
+            if (!kKeepOutputInRF) { // always true here: already inside the enclosing !kKeepOutputInRF branch
+              __syncthreads();
+            }
+          }
+        }
+         __syncthreads(); // we modify `m_prime` after
+      }
+
+      if (kKeepOutputInRF) { // output was kept in accum_o across all key blocks: run one final epilogue
+        constexpr bool kIsFirst = true;
+        constexpr bool kIsLast = true;
+        using DefaultEpilogue = typename MM1::DefaultEpilogue;
+        using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp;
+        using ElementCompute = typename DefaultOp::ElementCompute;
+        using EpilogueOutputOp =
+            typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize<
+                output_t,       // output
+                output_accum_t, // source
+                DefaultOp::kCount,
+                typename DefaultOp::ElementAccumulator, // accum
+                output_accum_t, // compute
+                kIsFirst,
+                kIsLast,
+                cutlass::Array<ElementCompute, kQueriesPerBlock>>;
+        using Epilogue =
+            typename cutlass::epilogue::threadblock::EpiloguePipelined<
+                typename DefaultEpilogue::Shape,
+                typename MM1::Mma::Operator,
+                DefaultEpilogue::kPartitionsK,
+                typename MM1::OutputTileIterator, // destination
+                typename DefaultEpilogue::AccumulatorFragmentIterator,
+                typename DefaultEpilogue::WarpTileIterator,
+                typename DefaultEpilogue::SharedLoadIterator,
+                EpilogueOutputOp,
+                typename DefaultEpilogue::Padding,
+                DefaultEpilogue::kFragmentsPerIteration,
+                true, // IterationsUnroll
+                typename MM1::OutputTileIteratorAccum // source tile
+                >;
+        auto dest_iter = createOutputIter(0);
+        EpilogueOutputOp rescale(s_prime, out_rescale);
+        Epilogue epilogue(
+            shared_storage.epilogue_shared_storage(),
+            thread_id(),
+            warp_id(),
+            lane_id());
+        MM1::Mma::drain_cp_asyncs();
+        epilogue(rescale, dest_iter, accum_o);
+      }
+
+      // Next tile
+      problem_visitor.advance(gridDim.x);
+      __syncthreads(); // Don't start the next iteration until all threads are done using shared memory.
+    }
+  }
+
+  template <typename WarpIteratorC>
+  CUTLASS_DEVICE static void iterative_softmax( // one softmax update step for the current block of scores
+      typename WarpIteratorC::Fragment& frag_o, // output so far
+      typename WarpIteratorC::Fragment& frag, // scores for this block; scaled and exponentiated in place
+      cutlass::Array<accum_t, kQueriesPerBlock>& mi,
+      cutlass::Array<accum_t, kQueriesPerBlock>& m_prime,
+      cutlass::Array<accum_t, kQueriesPerBlock>& s_prime,
+      cutlass::Array<accum_t, kQueriesPerBlock>& out_rescale,
+      cutlass::Array<accum_t, kQueriesPerBlock * MM0::MmaCore::WarpCount::kN>&
+          addition_storage,
+      int8_t lane_id,
+      int8_t thread_id, // not referenced in the body
+      int8_t warp_id,
+      int max_col, // columns >= max_col are ignored for the max and set to 0 after exp
+      bool is_first, // when true, `frag_o` is not rescaled
+      typename WarpIteratorC::TensorCoord const& tile_offset,
+      float scaling) {
+    /* Iterates on the accumulator and corresponding position on result matrix
+
+    (1) Update `mi[r]` to the max value of the row `r`
+    (2) In a second iteration do the following:
+        (a) accum       <- exp2(accum - mi)   (accum is pre-multiplied by log2(e))
+        (b) out_rescale <- exp2(m_prime - mi) (1 if the max did not grow), then m_prime <- mi
+        (c) s_prime     <- s_prime * out_rescale + sum(accum)
+
+    The scores stay in the `frag` registers here; the caller stores them
+    to shared memory afterwards for the next matmul with Value.
+    */
+    using Fragment = typename WarpIteratorC::Fragment;
+    using LambdaIterator = typename DefaultMmaAccumLambdaIterator<
+        WarpIteratorC,
+        accum_t,
+        kThreadsPerWarp>::Iterator;
+    // Kept as float (rather than double); folded into the scale below so exp2f can be used
+    constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E
+
+    static_assert(kQueriesPerBlock % kNumWarpsPerBlock == 0, "");
+    static constexpr int kLinesPerWarp = kQueriesPerBlock / kNumWarpsPerBlock;
+
+    frag = cutlass::multiplies<Fragment>()(scaling * kLog2e, frag);
+
+    auto lane_offset =
+        LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset);
+
+    // First update `mi` to the max per-row
+    {
+      accum_t max;
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) {
+            max = -cutlass::platform::numeric_limits<accum_t>::infinity();
+          },
+          [&](int accum_m, int accum_n, int idx) {
+            if (accum_n < max_col) {
+              max = cutlass::fast_max(max, frag[idx]);
+            }
+          },
+          [&](int accum_m) {
+            // Having 4x atomicMax seems faster than reduce within warp
+            // first...
+            atomicMaxFloat(&mi[accum_m], max);
+          });
+    }
+
+    // Make sure we all share the update values for `mi`
+    __syncthreads();
+
+    // Doing this `exp` is quite expensive. Let's
+    // split it across the warps
+    bool restore_mi_to_minus_inf = false;
+    if (lane_id < kLinesPerWarp) {
+      int id = warp_id * kLinesPerWarp + lane_id;
+      auto m_prime_id = m_prime[id];
+      auto mi_id = mi[id];
+      bool changed = m_prime_id < mi_id; // `false` if both are -inf
+      if (changed) {
+        auto m_prime_exp = exp2f(m_prime_id - mi_id);
+        out_rescale[id] = m_prime_exp;
+        s_prime[id] *= m_prime_exp;
+      } else {
+        // Only when bias is enabled, it's possible that all the first values
+        // of attention are masked to `-inf`. In that case we want to avoid
+        // `nan = exp2f(-inf - (-inf))` so we temporarily set `mi` to 0
+        if (kSupportsBias &&
+            mi_id == -cutlass::platform::numeric_limits<accum_t>::infinity()) {
+          restore_mi_to_minus_inf = true;
+          mi[id] = 0.0f;
+        }
+        out_rescale[id] = 1.0f;
+      }
+    }
+    __syncthreads(); // Update output fragments
+    if (kKeepOutputInRF && !is_first) {
+      accum_t line_rescale;
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) { line_rescale = out_rescale[accum_m]; },
+          [&](int accum_m, int accum_n, int idx) {
+            frag_o[idx] = frag_o[idx] * line_rescale;
+          },
+          [&](int accum_m) {});
+    }
+    // Exponentiate the scores in place, then compute per-row partial sums
+    {
+      accum_t mi_row, total_row;
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) { mi_row = mi[accum_m]; },
+          [&](int accum_m, int accum_n, int idx) {
+            frag[idx] =
+                (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
+          },
+          [&](int accum_m) {});
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) { total_row = 0.0; },
+          [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; },
+          [&](int accum_m) {
+            if (LambdaIterator::reduceSameRow(
+                    lane_id, total_row, [](accum_t a, accum_t b) {
+                      return a + b;
+                    })) {
+              // NOTE: we could atomically add `total_row` to `s_prime`, but
+              // it's faster (and deterministic) to avoid atomics here
+              addition_storage
+                  [accum_m + kQueriesPerBlock * tile_offset.column()] =
+                      total_row;
+            }
+          });
+    }
+
+    __syncthreads(); // partial sums in `addition_storage` are read by other threads below
+    if (lane_id < kLinesPerWarp) {
+      int id = warp_id * kLinesPerWarp + lane_id;
+      accum_t total_row = s_prime[id];
+      if (restore_mi_to_minus_inf) {
+        // Restore `mi`, see above when we set `restore_mi_to_minus_inf=true`
+        mi[id] = -cutlass::platform::numeric_limits<accum_t>::infinity();
+      } else {
+        m_prime[id] = mi[id];
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
+        total_row += addition_storage[id + kQueriesPerBlock * i];
+      }
+      s_prime[id] = total_row;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/fmha_grouped_problem_visitor.h b/third_party/fused_multi_head_attention/fmha_grouped_problem_visitor.h
new file mode 100644
index 0000000000..f88219304b
--- /dev/null
+++ b/third_party/fused_multi_head_attention/fmha_grouped_problem_visitor.h
@@ -0,0 +1,178 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Scheduler for grouped FMHA
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/gemm/kernel/grouped_problem_visitor.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+// Problem-size helper describing how a grouped FMHA problem is split into tiles
+template <typename ThreadblockShape>
+struct FMHAGroupedProblemSizeHelper {
+
+  CUTLASS_HOST_DEVICE
+  static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) {
+    // Only M is tiled: (m - 1 + kM) / kM tiles along M, extent 1 along N and K.
+    int const tiles_m = (problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM;
+    return cutlass::gemm::GemmCoord(tiles_m, 1, 1);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) {} // intentionally a no-op
+
+  CUTLASS_HOST_DEVICE
+  static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) {
+    return int32_t(grid.m() * grid.n());
+  }
+};
+
+} // namespace detail
+
+/// Tile visitor for grouped FMHA; tile iteration itself is inherited from GroupedProblemVisitor
+template <typename ThreadblockShape,
+          GroupScheduleMode GroupScheduleMode_,
+          int PrefetchTileCount,
+          int ThreadCount,
+          bool Transposed = false>
+struct FMHAGroupedProblemVisitor
+    : public GroupedProblemVisitor<detail::FMHAGroupedProblemSizeHelper<ThreadblockShape>,
+                                   ThreadblockShape,
+                                   GroupScheduleMode_,
+                                   PrefetchTileCount,
+                                   ThreadCount> {
+
+  using ProblemSizeHelper = detail::FMHAGroupedProblemSizeHelper<ThreadblockShape>;
+  using Base = GroupedProblemVisitor<ProblemSizeHelper,
+                                     ThreadblockShape,
+                                     GroupScheduleMode_,
+                                     PrefetchTileCount,
+                                     ThreadCount>;
+  using BaseParams = typename Base::Params;
+  using SharedStorage = typename Base::SharedStorage;
+
+  // Per-problem shapes of the two GEMMs; both arrays are indexed with the
+  // base visitor's current problem index.
+  cutlass::gemm::GemmCoord const *problem_sizes0;
+  cutlass::gemm::GemmCoord const *problem_sizes1;
+
+  /// Arguments used to construct the visitor
+  struct Params {
+    cutlass::gemm::GemmCoord const *problem_sizes0;
+    cutlass::gemm::GemmCoord const *problem_sizes1;
+    int32_t                         problem_count;
+    void const                     *workspace;
+    int32_t                         tile_count;
+
+    /// Default ctor: null pointers and zero counts
+    CUTLASS_HOST_DEVICE
+    Params()
+        : problem_sizes0(nullptr),
+          problem_sizes1(nullptr),
+          problem_count(0),
+          workspace(nullptr),
+          tile_count(0)
+    {}
+
+    /// Ctor from explicit values; `workspace` and `tile_count` are optional
+    CUTLASS_HOST_DEVICE
+    Params(cutlass::gemm::GemmCoord const *problem_sizes0,
+           cutlass::gemm::GemmCoord const *problem_sizes1,
+           int32_t                         problem_count,
+           void const                     *workspace = nullptr,
+           int32_t                         tile_count = 0)
+        : problem_sizes0(problem_sizes0),
+          problem_sizes1(problem_sizes1),
+          problem_count(problem_count),
+          workspace(workspace),
+          tile_count(tile_count)
+    {}
+
+    /// Builds the base-class parameters from the FMHA-specific ones
+    CUTLASS_HOST_DEVICE
+    BaseParams to_base() const {
+      // The base visitor is given problem_sizes1 because these determine
+      // the shape of the final output of FMHA.
+      BaseParams base_params(problem_sizes1, problem_count, workspace, tile_count);
+      return base_params;
+    }
+
+  };
+
+  /// Device-side ctor: forwards the converted parameters to the base visitor
+  /// and keeps the pointers to both problem-size arrays.
+  CUTLASS_DEVICE
+  FMHAGroupedProblemVisitor(Params const &params_,
+                            SharedStorage &shared_storage_,
+                            int32_t block_idx)
+      : Base(params_.to_base(), shared_storage_, block_idx),
+        problem_sizes0(params_.problem_sizes0),
+        problem_sizes1(params_.problem_sizes1)
+  {}
+
+  /// Shape of the first GEMM for the problem currently being visited
+  CUTLASS_HOST_DEVICE
+  cutlass::gemm::GemmCoord problem_size0() const {
+    GemmCoord shape = problem_sizes0[this->problem_idx];
+    ProblemSizeHelper::possibly_transpose_problem(shape);
+    return shape;
+  }
+
+  /// Shape of the second GEMM for the problem currently being visited
+  CUTLASS_HOST_DEVICE
+  cutlass::gemm::GemmCoord problem_size1() const {
+    GemmCoord shape = problem_sizes1[this->problem_idx];
+    ProblemSizeHelper::possibly_transpose_problem(shape);
+    return shape;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/fused_multi_head_attention_backward.cu b/third_party/fused_multi_head_attention/fused_multi_head_attention_backward.cu
new file mode 100644
index 0000000000..e91548875f
--- /dev/null
+++ b/third_party/fused_multi_head_attention/fused_multi_head_attention_backward.cu
@@ -0,0 +1,298 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+#include "kernel_backward.h"
+
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/host_tensor.h"
+
+
+using Arch = cutlass::arch::Sm80;
+static constexpr int kMaxK = 128;
+
+template <typename ArchTag, typename Element, int kMaxK>
+struct DefaultKernel {
+    // Selects the AttentionBackwardKernel configuration for (ArchTag, Element, kMaxK).
+    // Some heuristics to select the best kernel (tested on Sm60, Sm70, Sm80)
+    // NOTE: Requires quite a lot of shmem for Sm80+,
+    // so might require tweaking those manually for Sm86/Sm89
+    static constexpr bool kSupports64x128 =
+        ArchTag::kMinComputeCapability >= 80 ||
+        (ArchTag::kMinComputeCapability >= 70 &&
+        cutlass::sizeof_bits<Element>::value <= 16);
+    static constexpr int kBlockSizeI = kSupports64x128 && kMaxK > 64 ? 128 : 64;
+    static constexpr bool kIsHalf = cutlass::sizeof_bits<Element>::value <= 16;
+    static constexpr bool kOutputInRF = kIsHalf && kMaxK <= kBlockSizeI;
+    static constexpr bool kPreload = kIsHalf && ArchTag::kMinComputeCapability >= 80 && kOutputInRF;
+    static constexpr int kBlockSizeJ = kPreload && kMaxK > 64 ? 128 : 64;
+
+    using Kernel = AttentionBackwardKernel<
+        ArchTag,     // must be the same architecture the heuristics above were evaluated for
+        Element,
+        true,        // kIsAligned_
+        false,       // kApplyDropout_
+        kPreload,    // kPreload_
+        kBlockSizeI, // kBlockSizeI_,
+        kBlockSizeJ, // kBlockSizeJ_,
+        kMaxK,       // kMaxK
+        false,       // kKeysQueriesAlignedToBlockSize
+        true         // kEnableSplitKeys
+    >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+template <typename T> struct TypeName;  // element type -> dtype tag used in the stdin/stdout tensor descriptors
+template <> struct TypeName<float> { static constexpr const char* Name = "f32"; };
+template <> struct TypeName<cutlass::half_t> { static constexpr const char* Name = "f16"; };
+template <> struct TypeName<cutlass::bfloat16_t> { static constexpr const char* Name = "b16"; };
+
+/// Reads one whitespace-delimited token from stdin; terminates with exit code 1 unless it equals `expected`.
+void readExpect(std::string const& expected) {
+    std::string token;
+    std::cin >> token;
+    if (token == expected) return;
+    std::cerr << "FATAL: Read '" << token << "' but expected '" << expected << "'" << std::endl;
+    std::exit(1);
+}
+
+/// Reads `tensor_begin <dtype>:<name> <bytes> file <path> tensor_end` from stdin, loads <bytes> raw bytes from <path> into a 1 x N tensor and syncs it to the device; exits with code 1 if <bytes> is not a whole number of elements or the file cannot be read in full.
+template <typename Element>
+cutlass::HostTensor<Element, cutlass::layout::RowMajor> readTensorOnDevice(std::string const& expectedName) {
+    readExpect("tensor_begin");
+    readExpect(std::string(TypeName<Element>::Name) + ":" + expectedName);
+    uint64_t len = 0;  // payload size in bytes
+    std::cin >> len;
+    readExpect("file");
+    std::string filename;
+    std::cin >> filename;
+    cutlass::HostTensor<Element, cutlass::layout::RowMajor> tensor({int64_t(1), int64_t(len / sizeof(Element))});
+    std::fstream myFile(filename, std::ios::in | std::ios::binary );
+    // The tensor holds len / sizeof(Element) elements, so a partial trailing element would overrun the host buffer.
+    if (len % sizeof(Element) != 0 || !myFile.read((char*)tensor.host_data(), len)) {
+        std::cerr << "FATAL: Could not read " << len << " bytes for '" << expectedName << "' from '" << filename << "'" << std::endl;
+        std::exit(1);
+    }
+    readExpect("tensor_end");
+    tensor.sync_device();
+    return tensor;
+}
+
+int64_t readInt64(std::string const& expectedName) {  // parses "<expectedName> <integer>" from stdin
+    int64_t parsed = 0;
+    readExpect(expectedName);
+    std::cin >> parsed;
+    return parsed;
+}
+
+float readFloat(std::string const& expectedName) {  // parses "<expectedName> <float>" from stdin
+    float parsed = 0;
+    readExpect(expectedName);
+    std::cin >> parsed;
+    return parsed;
+}
+
+/// Syncs `tensor` back to the host, dumps its raw bytes into a file whose path is read
+/// from stdin after the token "tmpfile", then prints the tensor descriptor on stdout.
+template <typename Element>
+void writeTensor(std::string const& name, cutlass::HostTensor<Element, cutlass::layout::RowMajor>& tensor) {
+    tensor.sync_host(); // device->host
+    size_t const numBytes = tensor.size() * sizeof(Element);
+
+    // The driving process announces the destination as "tmpfile <path>"
+    readExpect("tmpfile");
+    std::string outPath;
+    std::cin >> outPath;
+
+    {   // the scope closes the file before the descriptor is printed
+        std::fstream outFile(outPath, std::ios::out | std::ios::binary );
+        outFile.write((char const*)tensor.host_data(), numBytes);
+    }
+    std::cout << "tensor_begin " << TypeName<Element>::Name << ":" << name << " "
+              << numBytes << " file " << outPath << " tensor_end" << std::endl;
+}
+
+void writeInt64(std::string const& name, int64_t value) {  // prints "<name> <value>" and a newline on stdout
+    std::cout << name << " " << value << std::endl;
+}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Element>
+int runKernel() {  // reads one problem from stdin, runs and times the backward kernel; 0 = success, 2 = bad input, -1 = CUDA error
+    using Kernel = typename DefaultKernel<Arch, Element, kMaxK>::Kernel;
+    // Local helper macros: READ_* fill fields of `p` from stdin; CUDA_CHECK makes this function return -1 on a CUDA error.
+#define READ_I64(NAME) p.NAME = (decltype(p.NAME))readInt64(#NAME)
+#define READ_TENSOR_AND_STRIDES_BMH(DT, NAME, NAME_XS) \
+    auto storage##NAME = readTensorOnDevice<DT>(#NAME); \
+    p.NAME##_ptr = storage##NAME.device_data(); \
+    READ_I64(NAME_XS##_strideB); \
+    READ_I64(NAME_XS##_strideM); \
+    READ_I64(NAME_XS##_strideH);
+
+#define CUDA_CHECK(FN) { \
+    auto cudaError = FN; \
+    if (cudaError != cudaSuccess) { \
+        std::cerr << "FATAL: " #FN " failed: " << cudaGetErrorString(cudaError) << std::endl; \
+        return -1; \
+    } \
+}
+
+    typename Kernel::Params p;
+    p.scale = readFloat("scale");
+    READ_I64(head_dim);
+    READ_I64(head_dim_value);
+    READ_I64(num_queries);
+    READ_I64(num_keys);
+    READ_I64(num_heads);
+    READ_I64(custom_mask_type);
+    READ_I64(num_batches);
+    int64_t repeat_count = readInt64("repeat_count");
+    READ_I64(num_splits_key);
+
+    READ_TENSOR_AND_STRIDES_BMH(Element, query, q);
+    READ_TENSOR_AND_STRIDES_BMH(Element, key, k);
+    READ_TENSOR_AND_STRIDES_BMH(Element, value, v);
+    auto lse = readTensorOnDevice<typename Kernel::lse_scalar_t>("logsumexp");
+    p.logsumexp_ptr = lse.device_data();
+    p.lse_strideB = readInt64("lse_strideB");
+    p.lse_strideH = readInt64("lse_strideH");
+
+    // output
+    auto stOutput = readTensorOnDevice<Element>("output");
+    p.output_ptr = stOutput.device_data();
+    READ_I64(o_strideB);
+    auto o_strideM = readInt64("o_strideM");  // p.o_strideM() is computed, so the input value can only be validated
+    if (o_strideM != p.o_strideM()) {
+        std::cerr << "Invalid `o_strideM`: " << o_strideM << " - expected " << p.o_strideM() << std::endl;
+        return 2;
+    }
+    READ_I64(o_strideH);
+
+    READ_TENSOR_AND_STRIDES_BMH(Element, grad_output, gO);
+
+    auto stDelta = readTensorOnDevice<typename Kernel::accum_t>("delta");
+    p.delta_ptr = stDelta.device_data();
+    READ_I64(delta_strideB);
+    READ_I64(delta_strideH);
+
+    // Allocate workspace (a failed allocation must not reach the kernel)
+    if (p.workspace_size()) {
+        CUDA_CHECK(cudaMalloc(&p.workspace, p.workspace_size()));
+    }
+
+    // Allocate outputs in BMHK format
+    p.gQKV_strideM_multiplier = 1;
+    p.gQ_strideH = p.head_dim;
+    p.gQ_strideB = p.gQ_strideM() * p.num_queries;
+    p.gK_strideH = p.head_dim;
+    p.gK_strideB = p.gK_strideM() * p.num_keys;
+    p.gV_strideH = p.head_dim_value;
+    p.gV_strideB = p.gV_strideM() * p.num_keys;
+
+    cutlass::HostTensor<Element, cutlass::layout::RowMajor> gQ({int64_t(1), p.gQ_strideB * p.num_batches});
+    cutlass::HostTensor<Element, cutlass::layout::RowMajor> gK({int64_t(1), p.gK_strideB * p.num_batches});
+    cutlass::HostTensor<Element, cutlass::layout::RowMajor> gV({int64_t(1), p.gV_strideB * p.num_batches});
+    p.grad_query_ptr = gQ.device_data();
+    p.grad_key_ptr = gK.device_data();
+    p.grad_value_ptr = gV.device_data();
+
+    if (!Kernel::check_supported(p)) {
+      std::cerr << "FATAL: Kernel does not support these inputs" << std::endl;
+      return 2;
+    }
+
+    // Run kernel
+    CUDA_CHECK(cudaDeviceSynchronize());
+    auto kernel_fn = attention_kernel_backward_batched_impl<Kernel>;
+    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
+    CUDA_CHECK(cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, int(smem_bytes)));
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+    CUDA_CHECK(cudaGetLastError());       // launch errors
+    CUDA_CHECK(cudaDeviceSynchronize());  // execution errors; outputs are written only after a clean run
+    std::cout << "OK ";
+    writeTensor("grad_query", gQ);
+    writeInt64("gQ_strideB", p.gQ_strideB);
+    writeInt64("gQ_strideM", p.gQ_strideM());
+    writeInt64("gQ_strideH", p.gQ_strideH);
+    writeTensor("grad_key", gK);
+    writeInt64("gK_strideB", p.gK_strideB);
+    writeInt64("gK_strideM", p.gK_strideM());
+    writeInt64("gK_strideH", p.gK_strideH);
+    writeTensor("grad_value", gV);
+    writeInt64("gV_strideB", p.gV_strideB);
+    writeInt64("gV_strideM", p.gV_strideM());
+    writeInt64("gV_strideH", p.gV_strideH);
+
+    // Timing
+    cudaEvent_t events[2];
+    for (auto & event : events) {
+      CUDA_CHECK(cudaEventCreate(&event));
+    }
+    CUDA_CHECK(cudaEventRecord(events[0]));
+    for (int i = 0; i < repeat_count; ++i) {
+        kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+    }
+    CUDA_CHECK(cudaEventRecord(events[1]));
+    CUDA_CHECK(cudaEventSynchronize(events[1]));
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    CUDA_CHECK(cudaEventElapsedTime(&runtime_ms, events[0], events[1]));
+
+    std::cout << "runtime_ms " << runtime_ms / float(repeat_count) << std::endl;
+    return 0;
+}
+
+/// Entry point: reads the dtype tag from stdin and runs the matching runKernel instantiation.
+int main() {
+    std::ios_base::sync_with_stdio(false);
+
+    std::string dtype;
+    std::cin >> dtype;
+    std::cerr << "Running kernel with dtype: " << dtype << std::endl;
+
+    // Each recognised tag returns the exit code of its kernel run.
+    if (dtype == "f16") return runKernel<cutlass::half_t>();
+    if (dtype == "b16") return runKernel<cutlass::bfloat16_t>();
+    if (dtype == "f32") return runKernel<float>();
+
+    // Anything else is rejected with exit code 3.
+    std::cerr << "FATAL: Unknown dtype: " << dtype << std::endl;
+    return 3;
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu b/third_party/fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
new file mode 100644
index 0000000000..5dad08d29e
--- /dev/null
+++ b/third_party/fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
@@ -0,0 +1,1110 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief CUTLASS Attention Example.
+
+    This workload computes a fused multi head attention.
+    Because it keeps the attention matrix in shared memory, it's both faster and
+    uses less global memory.
+
+    This is based on `"Self-Attention Does Not Need O(n^2) Memory" <http://arxiv.org/abs/2112.05682>`_,
+    and very similar to `"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness" <https://arxiv.org/abs/2205.14135>`_.
+
+    Algorithm:
+      In short, we can compute the output incrementally in blocks of size B,
+      we just need to divide the final result by the sum of all coefficients in
+      the softmax (which we compute incrementally) with the following pseudo-code:
+
+      ```
+      s_prime = torch.zeros([num_queries, 1])
+      O = torch.zeros([num_queries, head_size_v])
+      for i in range(0, K.shape[0], B):
+        si = exp((Q . K[i:i + B].t) * scale)
+        s_prime += si.sum(-1, keepdim=True)
+        O  += si . V[i:i + B]
+      O = O / s_prime
+      ```
+
+      In practice, and for numerical stability reasons,
+      we also subtract the maximum so far (`mi`) before doing
+      the exponential. When we encounter new keys, the maximum
+      used to compute O so far (`m_prime`) can differ from the
+      current maximum, so we update O before accumulating with
+
+      ```
+      O       = O * exp(m_prime - mi)
+      m_prime = mi
+      ```
+
+    Implementation details:
+      - `si` is stored in shared memory between the 2 back to back gemms
+      - we keep and accumulate the output
+      directly in registers if we can (`head_size_v <= 128`).
+      Otherwise, we store it & accumulate in global memory (slower)
+      - blocks are parallelized across the batch dimension, the number
+      of heads, and the query sequence size
+
+
+    Examples:
+
+      # Run an attention example with default setup
+      $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_fixed_seqlen
+
+      # Run an attention example with custom setup
+      $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_fixed_seqlen --head_number=2 --batch_size=3 --head_size=32 --head_size_v=64 --seq_length=512 --seq_length_kv=1024 --causal=true
+
+      Acknowledgement: Fixed-sequence-length FMHA code was upstreamed by Meta xFormers (https://github.com/facebookresearch/xformers).
+*/
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <vector>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_grouped.h"
+#include "cutlass/gemm/kernel/default_gemm_grouped.h"
+#include "cutlass/gemm/device/gemm_grouped.h"
+#include "cutlass/gemm/device/gemm_universal.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm_complex.h"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/kernel/gemm_grouped.h"
+#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h"
+#include "cutlass/fast_math.h"
+#include "kernel_forward.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Result structure: a runtime, a GFLOP/s figure, CUTLASS/CUDA status codes and a pass flag
+struct Result {
+
+  double runtime_ms;
+  double gflops;
+  cutlass::Status status;
+  cudaError_t error;
+  bool passed;  // not a constructor argument; always starts as true
+
+  //
+  // Methods
+  //
+  // Every constructor argument has a default, so `Result()` is a zeroed, successful result.
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  ):
+    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing, plus the per-(batch, head) problem sizes derived from the options
+struct Options {
+
+  bool help;
+  bool error;
+  bool reference_check;
+  bool use_mask;
+  bool causal;
+
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes0;
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes1;
+
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes0_real;
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes1_real;
+
+  int alignment;
+  int head_number;
+  int batch_size;
+  int head_size;
+  int head_size_v;
+  int seq_length;
+  int seq_length_kv;
+  int iterations;
+
+  // alpha0, alpha1 and beta are fixed
+  // in this multi-head attention example
+  float alpha0;
+  float alpha1;
+  float beta;
+
+  //
+  // Methods
+  //
+  Options():
+    help(false),
+    error(false),
+    reference_check(true),
+    use_mask(false),
+    causal(false),
+    alignment(1),
+    head_number(12),
+    batch_size(16),
+    head_size(64),
+    head_size_v(64),
+    seq_length(1024),
+    seq_length_kv(1024),
+    iterations(20),
+    alpha0(0), alpha1(0), beta(0)  // placeholders; the testbed's initialize_() assigns the real values
+  { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+      return;
+    }
+
+    cmd.get_cmd_line_argument("alignment", alignment, 1);
+    cmd.get_cmd_line_argument("head_number", head_number, 12);
+    cmd.get_cmd_line_argument("batch_size", batch_size, 16);
+    cmd.get_cmd_line_argument("head_size", head_size, 64);
+    cmd.get_cmd_line_argument("head_size_v", head_size_v, head_size);
+    cmd.get_cmd_line_argument("seq_length", seq_length, 1024);
+    cmd.get_cmd_line_argument("seq_length_kv", seq_length_kv, seq_length);
+    cmd.get_cmd_line_argument("use_mask", use_mask, false);
+    cmd.get_cmd_line_argument("iterations", iterations, 20);
+    cmd.get_cmd_line_argument("reference-check", reference_check, true);
+    cmd.get_cmd_line_argument("causal", causal, true);  // NOTE(review): fallback here is true, the constructor uses false - confirm which is intended
+
+    randomize_problems();
+
+  }
+
+  /// Fills the problem-size lists with one entry per (batch, head) pair; despite the name, nothing is randomized.
+  void randomize_problems() {
+    int problem_count = head_number * batch_size;
+
+    problem_sizes0.reserve(problem_count);
+    problem_sizes1.reserve(problem_count);
+
+    // When using mask, the original inputs are not padded
+    // and we need to save these info.
+    if (use_mask) {
+      problem_sizes0_real.reserve(problem_count);
+      problem_sizes1_real.reserve(problem_count);
+    }
+
+    for (int i = 0; i < batch_size; ++i) {
+      // problems belonging to the same batch share the same seq len
+      int m_real = seq_length;
+      int mkv_real = seq_length_kv;
+      int m = (m_real + alignment - 1) / alignment * alignment;
+      int mkv = (mkv_real + alignment - 1) / alignment * alignment;
+      int k0 = head_size;
+      int k1 = head_size_v;
+
+      for (int j = 0; j < head_number; ++j) {
+        cutlass::gemm::GemmCoord problem0(m, mkv, k0);
+        cutlass::gemm::GemmCoord problem1(m, k1, mkv);
+        problem_sizes0.push_back(problem0);
+        problem_sizes1.push_back(problem1);
+
+        if (use_mask) {
+          cutlass::gemm::GemmCoord problem0_real(m_real, mkv_real, k0);
+          cutlass::gemm::GemmCoord problem1_real(m_real, k1, mkv_real);
+          problem_sizes0_real.push_back(problem0_real);
+          problem_sizes1_real.push_back(problem1_real);
+        }
+      }
+    }
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+    out << "41_fused_multi_head_attention_fixed_seqlen\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --alignment=<int>           Sequence lengths are rounded up to a multiple of this value (default: --alignment=1)\n"
+      << "  --head_number=<int>         Head number in multi-head attention (default: --head_number=12)\n"
+      << "  --batch_size=<int>          Batch size in multi-head attention (default: --batch_size=16)\n"
+      << "  --head_size=<int>           Head size in multi-head attention (default: --head_size=64)\n"
+      << "  --head_size_v=<int>         Head size in multi-head attention for V (default: --head_size_v=head_size)\n"
+      << "  --seq_length=<int>          Sequence length in multi-head attention for Q (default: --seq_length=1024)\n"
+      << "  --seq_length_kv=<int>       Sequence length in multi-head attention for K/V (default: --seq_length_kv=seq_length)\n"
+      << "  --use_mask=<bool>           If true, performs padding-like masking in softmax.\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n"
+      << "  --reference-check=<bool>    If true, performs reference check.\n"
+      << "  --causal=<bool>             If true, uses causal masking.\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+
+    // Number of real-valued multiply-adds
+    int64_t fops = int64_t();
+
+    for (size_t i = 0; i < problem_sizes0.size(); ++i) {
+      auto const& problem0 = problem_sizes0[i];
+      auto const& problem1 = problem_sizes1[i];
+      for (int row = 0; row < problem0.m(); ++row) {
+        int num_cols0 = problem0.n();
+        if (causal) {
+          num_cols0 = std::min(row + 1, num_cols0);
+        }
+        // P <- Q . K_t
+        fops += 2 * num_cols0 * problem0.k();
+        // P <- exp(P - max(P))
+        fops += 2 * num_cols0;
+        // S <- sum(P)
+        fops += num_cols0 - 1;
+        // O <- P . V
+        fops += 2 * num_cols0 * problem1.n();
+        // O <- O / S
+        fops += num_cols0 * problem1.n();
+      }
+    }
+
+    return double(fops) / double(1.0e9) / runtime_s;
+  }
+};
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Attention>
+class TestbedAttention {
+public:
+
+  //
+  // Type definitions
+  //
+
+  using ElementQ = typename Attention::scalar_t;
+  using ElementK = typename Attention::scalar_t;
+  using ElementP = typename Attention::accum_t;
+  using ElementAccumulator = typename Attention::accum_t;
+  using ElementV = typename Attention::scalar_t;
+  using ElementO = typename Attention::output_t;
+
+  using ElementCompute = typename Attention::accum_t;
+
+  using ElementNorm = typename Attention::accum_t;
+  using ElementSum = typename Attention::accum_t;
+  using ElementSoftmaxCompute = typename Attention::accum_t;
+
+  using LayoutQ = cutlass::layout::RowMajor;
+  using LayoutK = cutlass::layout::ColumnMajor;
+  using LayoutP = cutlass::layout::RowMajor;
+  using LayoutV = cutlass::layout::RowMajor;
+  using LayoutO = cutlass::layout::RowMajor;
+
+  using MatrixCoord = typename LayoutP::TensorCoord;
+
+private:
+
+  //
+  // Data members
+  //
+
+  Options & options;
+
+  /// Initialization
+  cutlass::Distribution::Kind init_Q;
+  cutlass::Distribution::Kind init_K;
+  cutlass::Distribution::Kind init_P;
+  cutlass::Distribution::Kind init_V;
+  cutlass::Distribution::Kind init_O;
+  uint32_t seed;
+
+  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> problem_sizes_device0;
+  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> problem_sizes_device1;
+  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> problem_sizes_device0_real;
+
+  std::vector<int64_t> offset_Q;
+  std::vector<int64_t> offset_K;
+  std::vector<int64_t> offset_P;
+  std::vector<int64_t> offset_V;
+  std::vector<int64_t> offset_O;
+
+  std::vector<int64_t> ldq_host;
+  std::vector<int64_t> ldk_host;
+  std::vector<int64_t> ldp_host;
+  std::vector<int64_t> ldv_host;
+  std::vector<int64_t> ldo_host;
+  std::vector<int64_t> seqlen_host;
+
+  cutlass::DeviceAllocation<int64_t> ldq;
+  cutlass::DeviceAllocation<int64_t> ldk;
+  cutlass::DeviceAllocation<int64_t> ldp;
+  cutlass::DeviceAllocation<int64_t> ldv;
+  cutlass::DeviceAllocation<int64_t> ldo;
+  cutlass::DeviceAllocation<int64_t> seqlen;
+
+  cutlass::DeviceAllocation<ElementQ> block_Q;
+  cutlass::DeviceAllocation<ElementK> block_K;
+  cutlass::DeviceAllocation<ElementP> block_P;
+  cutlass::DeviceAllocation<ElementV> block_V;
+  cutlass::DeviceAllocation<ElementO> block_O;
+  cutlass::DeviceAllocation<ElementNorm> block_Norm;
+  cutlass::DeviceAllocation<ElementSum> block_Sum;
+
+  cutlass::DeviceAllocation<int64_t> offset_P_Device;
+
+  cutlass::DeviceAllocation<ElementQ *> ptr_Q;
+  cutlass::DeviceAllocation<ElementK *> ptr_K;
+  cutlass::DeviceAllocation<ElementP *> ptr_P;
+  cutlass::DeviceAllocation<ElementV *> ptr_V;
+  cutlass::DeviceAllocation<ElementO *> ptr_O;
+
+public:
+
+  //
+  // Methods
+  //
+
+  TestbedAttention(  // stores the options reference, one fill distribution per tensor, and the seed
+    Options &options_,  // held by reference, so it must outlive the testbed
+    cutlass::Distribution::Kind init_Q_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_K_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_P_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_V_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_O_ = cutlass::Distribution::Uniform,
+    uint32_t seed_ = 3080
+  ):
+    options(options_), init_Q(init_Q_), init_K(init_K_), init_P(init_P_), init_V(init_V_), init_O(init_O_), seed(seed_) { }
+
+  int problem_count() const {
+    return options.batch_size * options.head_number;  // one problem per (batch, head) pair
+  }
+
+private:
+
+  /// Fills `capacity` elements starting at `ptr` through the cutlass::reference::device
+  /// fill helpers, using the pattern selected by `dist_kind`.
+  template <typename Element>
+  void initialize_tensor_(
+    Element *ptr,
+    size_t capacity,
+    cutlass::Distribution::Kind dist_kind,
+    uint32_t seed) {
+
+    switch (dist_kind) {
+    case cutlass::Distribution::Uniform: {
+      // Narrow element types get a narrower value range.
+      int const bits_input = cutlass::sizeof_bits<Element>::value;
+      Element scope_max, scope_min;
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else {
+        // Wider inputs use [-8, 8] whatever the width of ElementP.
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::device::BlockFillRandomUniform(
+        ptr, capacity, seed, scope_max, scope_min, 0);
+      break;
+    }
+
+    case cutlass::Distribution::Gaussian:
+      cutlass::reference::device::BlockFillRandomGaussian(
+        ptr, capacity, seed, Element(), Element(0.5f));
+      break;
+
+    case cutlass::Distribution::Sequential:
+      // Fill with increasing elements
+      cutlass::reference::device::BlockFillSequential(
+        ptr, capacity, Element(1), Element());
+      break;
+
+    default:
+      // Fill with all 1s
+      cutlass::reference::device::BlockFillSequential(
+        ptr, capacity, Element(), Element(1));
+      break;
+    }
+  }
+
+  /// Initializes data structures
+  void initialize_() {
+
+    //
+    // Set scalars for the MHA example
+    //
+
+    options.alpha0 = 1.0f / sqrt(float(options.head_size));
+    options.alpha1 = 1.0f;
+    options.beta = 0;
+
+    //
+    // Choose random problem sizes
+    //
+
+    // construct a few problems of random sizes
+    srand(seed);
+
+    int64_t total_elements_Q = 0;
+    int64_t total_elements_K = 0;
+    int64_t total_elements_P = 0;
+    int64_t total_elements_V = 0;
+    int64_t total_elements_O = 0;
+
+    ldq_host.resize(problem_count());
+    ldk_host.resize(problem_count());
+    ldp_host.resize(problem_count());
+    ldv_host.resize(problem_count());
+    ldo_host.resize(problem_count());
+    seqlen_host.resize(problem_count());
+
+    // Create tensors in BMHK format, where
+    // B = batch_size
+    // M = sequence length
+    // H = num_heads
+    // K = embedding size per head
+    int64_t batch_offset_Q, batch_offset_K, batch_offset_V, batch_offset_O;
+
+    for (int32_t b = 0; b < options.batch_size; ++b) {
+      batch_offset_Q = total_elements_Q;
+      batch_offset_K = total_elements_K;
+      batch_offset_V = total_elements_V;
+      batch_offset_O = total_elements_O;
+      for (int32_t h = 0; h < options.head_number; ++h) {
+        int32_t i = h + b * options.head_number;
+
+        auto problem0 = options.problem_sizes0.at(i);
+        auto problem1 = options.problem_sizes1.at(i);
+
+        ldq_host.at(i) = LayoutQ::packed({problem0.m(), options.head_number * problem0.k()}).stride(0);
+        ldk_host.at(i) = LayoutK::packed({options.head_number * problem0.k(), problem0.n()}).stride(0);
+        ldp_host.at(i) = LayoutP::packed({problem0.m(), problem0.n()}).stride(0);
+        ldv_host.at(i) = LayoutV::packed({problem1.k(), options.head_number * problem1.n()}).stride(0);
+        ldo_host.at(i) = LayoutO::packed({problem1.m(), options.head_number * problem1.n()}).stride(0);
+
+        // m = n for attention problems.
+        seqlen_host.at(i) = problem0.m();
+
+        offset_Q.push_back(batch_offset_Q + h * problem0.k());
+        offset_K.push_back(batch_offset_K + h * problem0.k());
+        offset_P.push_back(total_elements_P);
+        offset_V.push_back(batch_offset_V + h * problem0.k());
+        offset_O.push_back(batch_offset_O + h * problem1.n());
+
+        int64_t elements_Q = problem0.m() * problem0.k();
+        int64_t elements_K = problem0.k() * problem0.n();
+        int64_t elements_P = problem0.m() * problem0.n();
+        int64_t elements_V = problem1.k() * problem1.n();
+        int64_t elements_O = problem1.m() * problem1.n();
+
+        total_elements_Q += elements_Q;
+        total_elements_K += elements_K;
+        total_elements_P += elements_P;
+        total_elements_V += elements_V;
+        total_elements_O += elements_O;
+      }
+    }
+
+    problem_sizes_device0.reset(problem_count());
+    problem_sizes_device1.reset(problem_count());
+    problem_sizes_device0.copy_from_host(options.problem_sizes0.data());
+    problem_sizes_device1.copy_from_host(options.problem_sizes1.data());
+
+    if (options.use_mask) {
+      problem_sizes_device0_real.reset(problem_count());
+      problem_sizes_device0_real.copy_from_host(options.problem_sizes0_real.data());
+    }
+
+    ldq.reset(problem_count());
+    ldk.reset(problem_count());
+    ldp.reset(problem_count());
+    ldv.reset(problem_count());
+    ldo.reset(problem_count());
+    seqlen.reset(problem_count());
+
+    ldq.copy_from_host(ldq_host.data());
+    ldk.copy_from_host(ldk_host.data());
+    ldp.copy_from_host(ldp_host.data());
+    ldv.copy_from_host(ldv_host.data());
+    ldo.copy_from_host(ldo_host.data());
+    seqlen.copy_from_host(seqlen_host.data());
+
+    //
+    // Assign pointers
+    //
+
+    block_Q.reset(total_elements_Q);
+    block_K.reset(total_elements_K);
+    block_P.reset(total_elements_P);
+    block_V.reset(total_elements_V);
+    block_O.reset(total_elements_O);
+
+    offset_P_Device.reset(problem_count());
+
+    // sync offset with device
+    cutlass::device_memory::copy_to_device(offset_P_Device.get(), offset_P.data(), offset_P.size());
+
+    std::vector<ElementQ *> ptr_Q_host(problem_count());
+    std::vector<ElementK *> ptr_K_host(problem_count());
+    std::vector<ElementP *> ptr_P_host(problem_count());
+    std::vector<ElementV *> ptr_V_host(problem_count());
+    std::vector<ElementO *> ptr_O_host(problem_count());
+    std::vector<ElementNorm *> ptr_norm_host(problem_count());
+    std::vector<ElementSum *> ptr_sum_host(problem_count());
+
+    for (int32_t i = 0; i < problem_count(); ++i) {
+      ptr_Q_host.at(i) = block_Q.get() + offset_Q.at(i);
+      ptr_K_host.at(i) = block_K.get() + offset_K.at(i);
+      ptr_P_host.at(i) = block_P.get() + offset_P.at(i);
+      ptr_V_host.at(i) = block_V.get() + offset_V.at(i);
+      ptr_O_host.at(i) = block_O.get() + offset_O.at(i);
+    }
+
+    ptr_Q.reset(problem_count());
+    ptr_Q.copy_from_host(ptr_Q_host.data());
+    
+    ptr_K.reset(problem_count());
+    ptr_K.copy_from_host(ptr_K_host.data());
+    
+    ptr_P.reset(problem_count());
+    ptr_P.copy_from_host(ptr_P_host.data());
+
+    ptr_V.reset(problem_count());
+    ptr_V.copy_from_host(ptr_V_host.data());
+
+    ptr_O.reset(problem_count());
+    ptr_O.copy_from_host(ptr_O_host.data());
+
+    //
+    // Initialize the problems of the workspace
+    //
+
+    initialize_tensor_(block_Q.get(), total_elements_Q, init_Q, seed + 1);
+    initialize_tensor_(block_K.get(), total_elements_K, init_K, seed + 2);
+    initialize_tensor_(block_V.get(), total_elements_V, init_V, seed + 3);
+
+  }
+
+  /// Element-wise comparison of `vector_Input` against `vector_Input_Ref`.
+  /// Checks the shorter of the two lengths unless `verify_length` != -1 overrides
+  /// it. An element fails if the reference is NaN, the difference is NaN/inf, or
+  /// the difference exceeds BOTH tolerances below; the first failure is printed
+  /// and false is returned. Inputs are taken by const reference (read-only, no
+  /// copy of the buffers).
+  template<typename Element>
+  bool verify_tensor_(std::vector<Element> const &vector_Input,
+                       std::vector<Element> const &vector_Input_Ref,
+                       int64_t verify_length = -1) {
+    int64_t size = (vector_Input.size() < vector_Input_Ref.size()) ? vector_Input.size() : vector_Input_Ref.size();
+    size = (verify_length == -1) ? size : verify_length;
+    float abs_tol = 5e-2f;  // 0.05 for absolute error
+    float rel_tol = 1e-1f;  // 10% for relative error
+    for (int64_t i = 0; i < size; ++i) {
+      float diff = (float)(vector_Input.at(i) - vector_Input_Ref.at(i));
+      float abs_diff = fabs(diff);
+      float abs_ref = fabs((float)vector_Input_Ref.at(i) + 1e-5f);
+      float relative_diff = abs_diff / abs_ref;
+      if ( (isnan(vector_Input_Ref.at(i)) || isnan(abs_diff) || isinf(abs_diff)) ||  (abs_diff > abs_tol && relative_diff > rel_tol)) {
+        printf("[%d/%d] diff = %f, rel_diff = %f, {computed=%f, ref=%f}.\n", int(i), int(size), abs_diff, relative_diff, (float)(vector_Input.at(i)), (float)(vector_Input_Ref.at(i)));
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Checks the kernel output (block_O) against a reference built per batch: reference GEMM, host softmax, reference GEMM.
+  bool verify_() {
+    // Returns false as soon as one batch mismatches; true when every batch matches.
+    bool passed = true;
+
+    for (int32_t b = 0; b < options.batch_size; ++b) {
+      int32_t i = b * options.head_number;
+      // Problem size is the same for all heads
+      cutlass::gemm::GemmCoord problem0 = options.problem_sizes0.at(b * options.head_number);
+      cutlass::gemm::GemmCoord problem1 = options.problem_sizes1.at(b * options.head_number);
+
+      MatrixCoord extent_Q{problem0.m(), problem0.k()};
+      MatrixCoord extent_K{problem0.k(), problem0.n()};
+      MatrixCoord extent_P{problem0.m(), problem0.n()};
+      MatrixCoord extent_V{problem1.k(), problem1.n()};
+      MatrixCoord extent_O{problem1.m(), problem1.n()};
+      // Copy this batch's kernel output to the host; block_Ref_O will receive the reference output.
+      LayoutO layout_O(ldo_host.at(i));
+      std::vector<ElementO> matrix_O(layout_O.capacity(extent_O));
+      cutlass::device_memory::copy_to_host(matrix_O.data(),   block_O.get() + offset_O.at(i), matrix_O.size());
+      cutlass::DeviceAllocation<ElementO>    block_Ref_O(layout_O.capacity(extent_O));
+
+      for (int32_t h = 0; h < options.head_number; ++h) {
+        i = h + b * options.head_number;
+
+        LayoutQ layout_Q(ldq_host.at(i));
+        LayoutK layout_K(ldk_host.at(i));
+        LayoutP layout_P(ldp_host.at(i));
+        LayoutV layout_V(ldv_host.at(i));
+
+        cutlass::TensorView<ElementQ, LayoutQ> view_Q(block_Q.get() + offset_Q.at(i), layout_Q, extent_Q);
+        cutlass::TensorView<ElementK, LayoutK> view_K(block_K.get() + offset_K.at(i), layout_K, extent_K);
+        cutlass::TensorView<ElementV, LayoutV> view_V(block_V.get() + offset_V.at(i), layout_V, extent_V);
+        cutlass::TensorView<ElementO, LayoutO> view_Ref_O_device(block_Ref_O.get() + offset_O.at(i) - offset_O.at(b * options.head_number), layout_O, extent_O);
+        // Per-head device scratch buffer for the reference attention matrix P.
+        cutlass::DeviceAllocation<ElementP>    block_Ref_P(layout_P.capacity(extent_P));
+        cutlass::TensorView<ElementP, LayoutP> view_Ref_P_device(block_Ref_P.get(), layout_P, extent_P);
+
+        // Reference GEMM #1: P from Q and K, scaled by alpha0
+        cutlass::reference::device::GemmComplex<
+            ElementQ, LayoutQ,
+            ElementK, LayoutK,
+            ElementP, LayoutP, 
+            ElementCompute, ElementAccumulator
+        >(
+          problem0,
+          ElementAccumulator(options.alpha0), 
+          view_Q,
+          Attention::MM0::Mma::kTransformA,
+          view_K,
+          Attention::MM0::Mma::kTransformB,
+          ElementAccumulator(options.beta), 
+          view_Ref_P_device, 
+          view_Ref_P_device, 
+          ElementAccumulator(0)
+        );
+
+        // Compute softmax for P. We need to explicitly compute softmax
+        // over P because softmax is fused to the second GEMM in the
+        // profiled implementation.
+        std::vector<ElementP> matrix_Ref(layout_P.capacity(extent_P));
+        cutlass::device_memory::copy_to_host(matrix_Ref.data(), block_Ref_P.get(), matrix_Ref.size());
+        cutlass::TensorView<ElementP, LayoutP> view_Ref_host(matrix_Ref.data(), layout_P, extent_P);
+        std::vector<ElementNorm> vector_Norm_Ref(problem0.m());
+        std::vector<ElementSum> vector_Sum_Ref(problem0.m());
+        // With use_mask the softmax only spans the unpadded ("real") key count.
+        int n_dim = options.use_mask ? options.problem_sizes0_real.at(i).n() : problem0.n();
+
+        // Compute softmax for reference matrix; with causal masking row m spans keys 0..m
+        for (int m = 0; m < problem0.m(); m++) {
+          int n_dim_row = n_dim;
+          if (options.causal) {
+            n_dim_row = std::min(m + 1, n_dim);
+          }
+          ElementSoftmaxCompute max = ElementSoftmaxCompute(view_Ref_host.ref().at({m, 0}));
+          for (int n = 1; n < n_dim_row; n++) {
+            max = std::max(max, ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})));
+          }
+          // Record the row maximum (subtracted from every entry before exp() below).
+          vector_Norm_Ref.at(m) = ElementNorm(max);
+
+          ElementSoftmaxCompute sum = ElementSoftmaxCompute();
+          for (int n = 0; n < n_dim_row; n++) {
+            sum += std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max );
+          }
+          ElementSoftmaxCompute inv_sum = ElementSoftmaxCompute(1.0f / sum);
+          // Record the reciprocal of the row sum.
+          vector_Sum_Ref.at(m) = ElementSum(inv_sum);
+
+          for (int n = 0; n < n_dim_row; n++) {
+            view_Ref_host.ref().at({m, n}) = ElementP(
+              std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max ) * inv_sum
+            );
+          }
+          // Mask out the rest of the attention matrix
+          for (int n = n_dim_row; n < n_dim; ++n) {
+            view_Ref_host.ref().at({m, n}) = ElementP(0);
+          }
+        }
+
+        // when not using mask, problem_real and problem share the same sizes
+        if (options.use_mask) {
+          for (int m = 0; m < problem0.m(); m++) {
+            for (int n = n_dim; n < problem0.n(); n++) {
+              view_Ref_host.ref().at({m, n}) = ElementP(0);
+            }
+          }
+        }
+        // Upload the normalized P so the second reference GEMM can consume it.
+        cutlass::device_memory::copy_to_device(block_Ref_P.get(), matrix_Ref.data(), matrix_Ref.size());
+        // NOTE(review): the transforms passed below are MM0's, not MM1's - confirm this is intended.
+        // Reference GEMM #2: O from P and V, scaled by alpha1
+        cutlass::reference::device::GemmComplex<
+            ElementP, LayoutP,
+            ElementV, LayoutV,
+            ElementO, LayoutO, 
+            ElementCompute, ElementAccumulator
+        >(
+          problem1,
+          ElementAccumulator(options.alpha1), 
+          view_Ref_P_device,
+          Attention::MM0::Mma::kTransformA,
+          view_V,
+          Attention::MM0::Mma::kTransformB,
+          ElementAccumulator(options.beta), 
+          view_Ref_O_device, 
+          view_Ref_O_device, 
+          ElementAccumulator(0)
+        );
+      }
+
+      // Copy to host memory
+      std::vector<ElementO> matrix_Ref_O(layout_O.capacity(extent_O));
+      cutlass::device_memory::copy_to_host(matrix_Ref_O.data(), block_Ref_O.get(), matrix_Ref_O.size());
+
+      // printf("Pb %d: \n    Q=(offset=%d, ldq=%d)\n    K=(offset=%d, ldk=%d)\n    O=(offset=%d, ldo=%d)\n",
+      //   int(i), int(offset_Q[i]), int(ldq_host[i]), int(offset_K[i]), int(ldk_host[i]), int(offset_O[i]), int(ldo_host[i]));
+  
+      bool verified_O = false;
+      // verified_O was just set to false, so this comparison always runs.
+      if (!verified_O) {
+        verified_O = verify_tensor_<ElementO>(matrix_O, matrix_Ref_O);
+      }
+
+      passed = passed && verified_O;
+
+      if (!passed) {
+        std::cerr << "\n***\nError - problem " << i << " (batch " << b << ") failed the QA check\n***\n" << std::endl;
+
+        if (!verified_O) {
+          std::cout << "Final matrix output is incorrect" << std::endl;
+        }
+
+        return passed;
+      }
+    }
+
+    return passed;
+  }
+
+public:
+
+
+  /// Executes a CUTLASS Attention kernel and measures runtime.
+  ///
+  /// Initializes the workspace, launches the kernel once and, when
+  /// options.reference_check is set, verifies the output; then times
+  /// options.iterations launches with CUDA events and prints a summary.
+  /// Returns a Result holding the average runtime, GFLOPs, the most recent CUDA
+  /// error code and the verification flag. `passed` stays false when the
+  /// accumulator allocation, the support check or the first launch fails.
+  Result profile() {
+
+    Result result;
+    result.passed = false;
+
+    // Initialize the problem
+    initialize_();
+
+    typename Attention::Params p;
+    { // set parameters
+      p.query_ptr = block_Q.get();
+      p.key_ptr = block_K.get();
+      p.value_ptr = block_V.get();
+      p.logsumexp_ptr = nullptr; // Only needed for bw
+      p.output_accum_ptr = nullptr;
+      if (Attention::kNeedsOutputAccumulatorBuffer) {
+        // Check the allocation rather than launching with a null accumulator buffer.
+        result.error = cudaMalloc(&p.output_accum_ptr, block_O.size() * sizeof(typename Attention::output_accum_t));
+        if (result.error != cudaSuccess) {
+          std::cerr << "cudaMalloc() failed: " << cudaGetErrorString(result.error) << std::endl;
+          return result;
+        }
+      }
+      p.output_ptr = block_O.get();
+
+      // TODO: support arbitrary seq lengths
+      // if (cu_seqlens_q.has_value()) {
+      //   p.cu_seqlens_q_ptr = (int32_t*)cu_seqlens_q->data_ptr();
+      //   p.cu_seqlens_k_ptr = (int32_t*)cu_seqlens_k->data_ptr();
+      // }
+
+      p.scale = options.alpha0;
+
+      p.num_heads = options.head_number;
+      p.num_batches = options.batch_size;
+      p.head_dim = options.head_size;
+      p.head_dim_value = options.head_size_v;
+      p.num_queries = options.seq_length;
+      p.num_keys = options.seq_length_kv;
+      if (options.causal) {
+        p.custom_mask_type = Attention::CausalFromTopLeft;
+      }
+
+      // All tensors are in BMHK shapes
+      p.q_strideH = options.head_size;
+      p.k_strideH = options.head_size;
+      p.v_strideH = options.head_size_v;
+      p.q_strideM = int32_t(ldq_host[0]);
+      p.k_strideM = int32_t(ldk_host[0]);
+      p.v_strideM = int32_t(ldv_host[0]);
+      p.q_strideB = p.q_strideM * options.seq_length;
+      p.k_strideB = p.k_strideM * options.seq_length_kv;
+      p.v_strideB = p.v_strideM * options.seq_length_kv;
+      p.o_strideM = p.head_dim_value * p.num_heads;
+    }
+
+    // launch kernel :)
+    constexpr auto kernel_fn = attention_kernel_batched_impl<Attention>;
+    int smem_bytes = sizeof(typename Attention::SharedStorage);
+    if (smem_bytes > 0xc000) {
+      cudaFuncSetAttribute(kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
+    }
+    if (!Attention::check_supported(p)) {
+      std::cerr << "Kernel does not support these inputs" << std::endl;
+      return result;
+    }
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+
+    // Wait for completion
+    result.error = cudaDeviceSynchronize();
+
+    if (result.error != cudaSuccess)  {
+      std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error);
+      return result;
+    }
+
+    // Verify correctness
+    result.passed = true;
+
+    if (options.reference_check) {
+      result.passed = verify_();
+    }
+
+    // Warm-up run
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+
+    // A raw launch never updates result.status; ask the runtime whether it failed.
+    result.error = cudaGetLastError();
+    if (result.error != cudaSuccess) {
+      std::cerr << "Failed to run CUTLASS Attention kernel." << std::endl;
+      return result;
+    }
+
+    // Construct events
+    cudaEvent_t events[2];
+
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        // Hand back the populated Result so the recorded CUDA error reaches the
+        // caller (a bare integer would be converted into a fresh Result).
+        return result;
+      }
+    }
+
+    // Record an event at the start of a series of GEMM operations
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Run profiling loop
+    for (int iter = 0; iter < options.iterations; ++iter) {
+      kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+    }
+
+    // Stop profiling loop: record an event when the GEMM operations have been launched.
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    // Compute average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(options.iterations);
+    result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+    // Cleanup. Note that the early-return error paths above skip this step.
+    for (auto event : events) {
+      (void)cudaEventDestroy(event);
+    }
+    // cudaFree(nullptr) is a no-op, so this is safe when no buffer was allocated.
+    (void)cudaFree(p.output_accum_ptr);
+
+    std::cout << std::endl;
+    std::cout << "CUTLASS Attention:\n"
+      << "====================================================" << std::endl;
+    std::cout << "    " << " {seq length Q, seq length KV, head size, head size V, head number, batch size} = {" << options.seq_length \
+      << ", " << options.seq_length_kv << ", " << options.head_size << ", " << options.head_size_v << ", " << options.head_number\
+      << ", " << options.batch_size << "}." << std::endl;
+    std::cout << std::endl;
+    std::cout << "    " << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << "    " << "GFLOPs: " << result.gflops << std::endl;
+
+    return result;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Instantiates the half-precision Sm80 attention kernel for the given block
+/// shape, profiles it through TestbedAttention and reports the outcome on stdout.
+/// Returns 0 when the profiled result is marked as passed, -1 otherwise.
+template <int kQueriesPerBlock, int kKeysPerBlock, int kMaxK>
+int run_attention(Options& options) {
+  // Fixed AttentionKernel arguments, named for readability.
+  using Scalar = cutlass::half_t;
+  using Arch = cutlass::arch::Sm80;
+  constexpr bool kIsAligned = true;
+  constexpr bool kSupportsDropout = false;
+  constexpr bool kSupportsBias = false;
+
+  using Attention = AttentionKernel<
+    Scalar,
+    Arch,
+    kIsAligned,
+    kQueriesPerBlock,
+    kKeysPerBlock,
+    kMaxK,
+    kSupportsDropout,
+    kSupportsBias
+  >;
+
+  // Test and profile
+  TestbedAttention<Attention> testbed(options);
+  Result const outcome = testbed.profile();
+  if (outcome.passed) {
+    std::cout << "\nPassed\n";
+    return 0;
+  }
+  std::cout << "Profiling CUTLASS attention has failed.\n" << "\nFailed\n";
+  return -1;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Entry point: checks the device, parses the command line and dispatches to
+/// the run_attention instantiation selected by --head_size_v.
+/// Exit codes: 0 on success, on --help, or when the GPU/toolkit is too old;
+/// -1 on a CUDA query error, a parse error or a failed run; -2 on unsupported options.
+int main(int argc, char const **args) {
+
+  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
+  cudaDeviceProp props;
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  // This example requires an NVIDIA Ampere-architecture GPU.
+  bool const toolkit_too_old = (__CUDACC_VER_MAJOR__ < 11);
+  bool const device_too_old = (props.major < 8);
+  if (toolkit_too_old || device_too_old) {
+    std::cout
+      << "CUTLASS's CUTLASS Attention example requires a GPU of NVIDIA's Ampere Architecture or "
+      << "later (compute capability 80 or greater).\n";
+    return 0;
+  }
+
+  // Parse options
+  Options options;
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  // Reject options this example cannot run.
+  if (options.use_mask) {
+    std::cerr << "--use_mask is not supported at the moment\n";
+    return -2;
+  }
+  if (options.alignment != 1) {
+    std::cerr << "--alignment=1 is the only supported value\n";
+    return -2;
+  }
+
+  // Determine kernel configuration based on head size.
+  // If head size is less than or equal to 64, each block operates over 64 queries and
+  // 64 keys, and partial results can be stored in the register file.
+  // If head size is greater than 64, each block operates over 32 queries and 128 keys,
+  // and partial results are stored in shared memory.
+  constexpr int kSmallQueriesPerBlock = 64;
+  constexpr int kSmallKeysPerBlock = 64;
+  constexpr int kSmallMaxK = 64; // <- Decrease to 32/16 if your problem is smaller
+  constexpr int kLargeQueriesPerBlock = 32;
+  constexpr int kLargeKeysPerBlock = 128;
+
+  int const head_size_v = options.head_size_v;
+
+  // head_size_v <= 64: 64 queries and 64 keys per block.
+  if (head_size_v <= 64) {
+    return run_attention<kSmallQueriesPerBlock, kSmallKeysPerBlock, kSmallMaxK>(options);
+  }
+
+  if (head_size_v <= 128) {
+    return run_attention<kLargeQueriesPerBlock, kLargeKeysPerBlock, 128>(options);
+  }
+
+  // Larger value heads use the instantiation with kMaxK = 65536.
+  return run_attention<kLargeQueriesPerBlock, kLargeKeysPerBlock, 65536>(options);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu b/third_party/fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
new file mode 100644
index 0000000000..6fbc7bc0bf
--- /dev/null
+++ b/third_party/fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
@@ -0,0 +1,1195 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief CUTLASS Attention Example.
+
+    This workload computes a fused multi head attention that supports variable sequence lengths.
+    Because it keeps the attention matrix in shared memory, it's both faster and
+    uses less global memory.
+
+    This is based on `"Self-Attention Does Not Need O(n^2) Memory" <http://arxiv.org/abs/2112.05682>`_,
+    and very similar to `"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness" <https://arxiv.org/abs/2205.14135>`_.
+
+    Algorithm:
+      In short, we can compute the output incrementally in blocks of size B,
+      we just need to divide the final result by the sum of all coefficients in
+      the softmax (which we compute incrementally) with the following pseudo-code:
+
+      ```
+      s_prime = torch.zeros([num_queries])
+      O = torch.zeros([num_queries, head_size_v])
+      for i in range(0, K.shape[0], B):
+        si = exp((Q . K[i:i + B].t) * scale)
+        s_prime += si.sum(-1)
+        O  += si . V[i:i + B]
+      O = O / s_prime[:, None]
+      ```
+
+      In practice, and for numerical stability reasons,
+      we also subtract the maximum so far (`mi`) before doing
+      the exponential. When we encounter new keys, the maximum
+      used to compute O so far (`m_prime`) can differ from the
+      current maximum, so we update O before accumulating with
+
+      ```
+      O       = O * exp(m_prime - mi)
+      m_prime = mi
+      ```
+
+    Implementation details:
+      - `si` is stored in shared memory between the 2 back to back gemms
+      - we keep and accumulate the output
+      directly in registers if we can (`head_size_v <= 128`).
+      Otherwise, we store it & accumulate in global memory (slower)
+      - blocks are parallelized across the batch dimension, the number
+      of heads, and the query sequence size
+
+
+    Examples:
+
+      # Run an attention example with default setup
+      $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_variable_seqlen
+
+      # Run an attention example with custom setup
+      $ ./examples/41_fused_multi_head_attention/41_fused_multi_head_attention_variable_seqlen --head_number=2 --batch_size=3 --head_size=32 --head_size_v=64 --seq_length=512 --seq_length_kv=1024 --causal=true
+
+      Acknowledgement: Fixed-sequence-length FMHA code was upstreamed by Meta xFormers (https://github.com/facebookresearch/xformers).
+                       Using grouped GEMM to handle variable sequence lengths is inspired by an idea originally prototyped by ByteDance Inc.
+*/
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <vector>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/device/gemm_grouped.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm_complex.h"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/fast_math.h"
+
+#include "default_fmha_grouped.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Outcome of one profiling run: timing, throughput, status codes and the
+/// verification flag.
+struct Result {
+  double runtime_ms;       // runtime in milliseconds
+  double gflops;           // throughput in GFLOP/s
+  cutlass::Status status;  // CUTLASS status code
+  cudaError_t error;       // CUDA runtime error code
+  bool passed;             // always constructed as true
+
+  /// Every argument is optional; the defaults describe a zero-time run with
+  /// success codes. `passed` has no parameter and always starts out true.
+  Result(double runtime_ms = 0,
+         double gflops = 0,
+         cutlass::Status status = cutlass::Status::kSuccess,
+         cudaError_t error = cudaSuccess)
+    : runtime_ms(runtime_ms),
+      gflops(gflops),
+      status(status),
+      error(error),
+      passed(true) { }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  bool error;
+  bool reference_check;
+  bool use_mask;
+  bool causal;
+  bool fixed_seq_length;
+
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes0;
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes1;
+
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes0_real;
+  std::vector<cutlass::gemm::GemmCoord> problem_sizes1_real;
+
+  int alignment;
+  int head_number;
+  int batch_size;
+  int head_size;
+  int head_size_v;
+  int seq_length;
+  int seq_length_kv;
+  int iterations;
+  int problem_count;
+
+  // alpha0, alpha1 and beta are fixed 
+  // in this multi-head attention example
+  float alpha0;
+  float alpha1;
+  float beta;
+
+  cutlass::gemm::kernel::GroupScheduleMode scheduler_mode;
+
+  //
+  // Methods
+  // 
+
+  Options():
+    help(false),
+    error(false),
+    alignment(1),
+    reference_check(true),
+    head_number(12),
+    batch_size(16),
+    head_size(64),
+    head_size_v(64),
+    seq_length(1024),
+    seq_length_kv(1024),
+    use_mask(false),
+    iterations(20),
+    causal(false),
+    fixed_seq_length(false),
+    problem_count(batch_size * head_number),
+    scheduler_mode(cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly)
+  { }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+      return;
+    }
+
+    cmd.get_cmd_line_argument("alignment", alignment, 1);
+    cmd.get_cmd_line_argument("head_number", head_number, 12);
+    cmd.get_cmd_line_argument("batch_size", batch_size, 16);
+    cmd.get_cmd_line_argument("head_size", head_size, 64);
+    cmd.get_cmd_line_argument("head_size_v", head_size_v, head_size);
+    cmd.get_cmd_line_argument("seq_length", seq_length, 1024);
+    cmd.get_cmd_line_argument("seq_length_kv", seq_length_kv, seq_length);
+    cmd.get_cmd_line_argument("use_mask", use_mask, false);
+    cmd.get_cmd_line_argument("iterations", iterations, 20);
+    cmd.get_cmd_line_argument("reference-check", reference_check, true);
+    cmd.get_cmd_line_argument("causal", causal, true);
+    cmd.get_cmd_line_argument("fixed_seq_length", fixed_seq_length, false);
+
+    std::vector<std::string> scheduler_mode_strs;
+    cmd.get_cmd_line_arguments("scheduler-mode", scheduler_mode_strs);
+
+    if (!scheduler_mode_strs.empty()) {
+      if (scheduler_mode_strs.size() > 1) {
+        std::cerr << "Only one scheduler mode may be passed in" << std::endl;
+        error = true;
+        return;
+      }
+      std::string scheduler_mode_str = scheduler_mode_strs[0];
+      if (scheduler_mode_str == "kDeviceOnly") {
+        scheduler_mode = cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly;
+      } else if (scheduler_mode_str == "kHostPrecompute") {
+        scheduler_mode = cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute;
+      } else {
+          std::cerr << "Unrecognized scheduler mode '" << scheduler_mode_str << "'" << std::endl;
+          error = true;
+          return;
+      }
+    }
+
+    if (fixed_seq_length) {
+      std::cout << "NOTE: Better performance is expected for fixed-sized sequence length from 41_fused_multi_head_attention_fixed_seqlen." << std::endl;
+    }
+
+    randomize_problems();
+  }
+
+  void randomize_problems() {
+
+    problem_count = head_number * batch_size;
+
+    problem_sizes0.reserve(problem_count);
+    problem_sizes1.reserve(problem_count);
+
+    // When using mask, the original inputs are not padded
+    // and we need to save these info.
+    if (use_mask) {
+      problem_sizes0_real.reserve(problem_count);
+      problem_sizes1_real.reserve(problem_count);
+    }
+
+    for (int i = 0; i < batch_size; ++i) {
+      // problems belonging to the same batch share the same seq len
+
+      int m_real, mkv_real;
+      if (fixed_seq_length) {
+        m_real = seq_length;
+        mkv_real = seq_length_kv;
+      } else {
+        m_real = (rand() % seq_length) + 1;
+
+        // Only randomize seq_length_kv if it was set to a different value than
+        // seq_length originally.
+        if (seq_length != seq_length_kv) {
+          mkv_real = (rand() % seq_length_kv) + 1;
+        } else {
+          mkv_real = m_real;
+        }
+      }
+
+      int m = (m_real + alignment - 1) / alignment * alignment;
+      int mkv = (mkv_real + alignment - 1) / alignment * alignment;
+      int k0 = head_size;
+      int k1 = head_size_v;
+
+      for (int j = 0; j < head_number; ++j) {
+        cutlass::gemm::GemmCoord problem0(m, mkv, k0);
+        cutlass::gemm::GemmCoord problem1(m, k1, mkv);
+
+        problem_sizes0.push_back(problem0);
+        problem_sizes1.push_back(problem1);
+
+        if (use_mask) {
+          cutlass::gemm::GemmCoord problem0_real(m_real, mkv_real, k0);
+          cutlass::gemm::GemmCoord problem1_real(m_real, k1, mkv_real);
+          problem_sizes0_real.push_back(problem0_real);
+          problem_sizes1_real.push_back(problem1_real);
+        }
+
+      }
+    }
+  }
+
+  void print_problems() {
+    std::cout << "     Running " << batch_size << " batches, each with " << head_number << " heads of size " << head_size << ":" << std::endl;
+    for (int i = 0; i < batch_size; ++i) {
+      int idx = i * head_number;
+      std::cout << "       [" << i << "] seq_length = " << problem_sizes0[idx].m() << " seq_length_kv = " << problem_sizes0[idx].n() << std::endl;
+    }
+  }
+
+  /// Writes the command-line help text to `out` and returns that same stream so the
+  /// caller can keep chaining insertions.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    // Banner and the help switch.
+    out << "41_fused_multi_head_attention_variable_seqlen\n\n";
+    out << "Options:\n\n";
+    out << "  --help                      If specified, displays this usage statement.\n\n";
+
+    // Problem-shape options.
+    out << "  --head_number=<int>         Head number in multi-head attention (default: --head_number=12)\n";
+    out << "  --batch_size=<int>          Batch size in multi-head attention (default: --batch_size=16)\n";
+    out << "  --head_size=<int>           Head size in multi-head attention (default: --head_size=64)\n";
+    out << "  --head_size_v=<int>         Head size in multi-head attention for V (default: --head_size_v=head_size)\n";
+    out << "  --seq_length=<int>          Sequence length in multi-head attention for Q (default: --seq_length=1024)\n";
+    out << "  --seq_length_kv=<int>       Sequence length in multi-head attention for K/V (default: --seq_length_kv=seq_length)\n";
+
+    // Behavioural switches.
+    out << "  --use_mask=<bool>           If true, performs padding-like masking in softmax.\n";
+    out << "  --iterations=<int>          Number of profiling iterations to perform.\n";
+    out << "  --reference-check=<bool>    If true, performs reference check.\n";
+    out << "  --causal=<bool>             If true, uses causal masking.\n";
+    out << "  --fixed_seq_length=<bool>   If true, uses the same sequence length for each item in the batch.\n";
+
+    return out;
+  }
+
+  /// Converts a measured runtime (in seconds) into GFLOP/s for the stored problems.
+  ///
+  /// The operation count is accumulated per query row. With causal masking, row r only
+  /// attends to min(r + 1, n) key columns, which shrinks every per-row term.
+  double gflops(double runtime_s) const {
+
+    int64_t total_ops = int64_t();
+
+    size_t const count = problem_sizes0.size();
+    for (size_t p = 0; p < count; ++p) {
+      int const rows = problem_sizes0[p].m();
+      int const keys = problem_sizes0[p].n();
+      int const depth = problem_sizes0[p].k();
+      int const out_cols = problem_sizes1[p].n();
+
+      for (int r = 0; r < rows; ++r) {
+        int const cols = causal ? std::min(r + 1, keys) : keys;
+
+        total_ops += 2 * cols * depth;     // P <- Q . K_t
+        total_ops += 2 * cols;             // P <- exp(P - max(P))
+        total_ops += cols - 1;             // S <- sum(P)
+        total_ops += 2 * cols * out_cols;  // O <- P . V
+        total_ops += cols * out_cols;      // O <- O / S
+      }
+    }
+
+    return double(total_ops) / double(1.0e9) / runtime_s;
+  }
+};
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Attention>
+class TestbedAttention {
+public:
+
+  //
+  // Type definitions
+  //
+
+  // Element types are taken from the attention kernel under test.
+  using scalar_t = typename Attention::GemmKernel::scalar_t;
+  using accum_t = typename Attention::GemmKernel::accum_t;
+  using output_t = typename Attention::GemmKernel::output_t;
+  using output_accum_t = typename Attention::GemmKernel::output_accum_t;
+
+  // Q, K and V use the kernel's scalar type; the intermediate matrix P uses the
+  // accumulator type; O uses the kernel's output type.
+  using ElementQ = scalar_t;
+  using ElementK = scalar_t;
+  using ElementP = accum_t;
+  using ElementAccumulator = accum_t;
+  using ElementV = scalar_t;
+  using ElementO = output_t;
+  using ElementOAccum = output_accum_t;
+
+  using ElementCompute = accum_t;
+
+  // Types used by the host-side reference softmax in verify_().
+  using ElementNorm = accum_t;
+  using ElementSum = accum_t;
+  using ElementSoftmaxCompute = accum_t;
+
+  // K is the only column-major operand; everything else is row-major.
+  using LayoutQ = cutlass::layout::RowMajor;
+  using LayoutK = cutlass::layout::ColumnMajor;
+  using LayoutP = cutlass::layout::RowMajor;
+  using LayoutV = cutlass::layout::RowMajor;
+  using LayoutO = cutlass::layout::RowMajor;
+
+  using MatrixCoord = typename LayoutP::TensorCoord;
+
+  // When true, initialize_() also allocates block_O_accumulate and its pointer table.
+  static bool const kNeedsOutputAccumulatorBuffer = Attention::GemmKernel::kNeedsOutputAccumulatorBuffer;
+
+private:
+
+  //
+  // Data members
+  //
+
+  // Caller-owned options. Held by non-const reference: initialize_() writes
+  // alpha0 / alpha1 / beta into it.
+  Options & options;
+
+  /// Initialization
+  // Fill distribution per tensor. In the methods of this class only init_Q, init_K and
+  // init_V are read (by initialize_()); init_P and init_O are stored but not used there.
+  cutlass::Distribution::Kind init_Q;
+  cutlass::Distribution::Kind init_K;
+  cutlass::Distribution::Kind init_P;
+  cutlass::Distribution::Kind init_V;
+  cutlass::Distribution::Kind init_O;
+  uint32_t seed;
+
+  // Device copies of options.problem_sizes0 / problem_sizes1. The `_real` copy is only
+  // populated when options.use_mask is set.
+  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> problem_sizes_device0;
+  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> problem_sizes_device1;
+  cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> problem_sizes_device0_real;
+
+  // Element offset of each problem inside the matching packed block_* buffer.
+  std::vector<int64_t> offset_Q;
+  std::vector<int64_t> offset_K;
+  std::vector<int64_t> offset_P;
+  std::vector<int64_t> offset_V;
+  std::vector<int64_t> offset_O;
+
+  // Per-problem leading dimensions (host side), plus the query sequence length
+  // (problem0.m()) in seqlen_host.
+  std::vector<int64_t> ldq_host;
+  std::vector<int64_t> ldk_host;
+  std::vector<int64_t> ldp_host;
+  std::vector<int64_t> ldv_host;
+  std::vector<int64_t> ldo_host;
+  std::vector<int64_t> seqlen_host;
+
+  // Device copies of the host vectors above.
+  cutlass::DeviceAllocation<int64_t> ldq;
+  cutlass::DeviceAllocation<int64_t> ldk;
+  cutlass::DeviceAllocation<int64_t> ldp;
+  cutlass::DeviceAllocation<int64_t> ldv;
+  cutlass::DeviceAllocation<int64_t> ldo;
+  cutlass::DeviceAllocation<int64_t> seqlen;
+
+  // Packed device buffers holding all problems back to back. block_O_accumulate is only
+  // allocated when kNeedsOutputAccumulatorBuffer is true. block_Norm and block_Sum are
+  // not referenced by the methods of this class.
+  cutlass::DeviceAllocation<ElementQ> block_Q;
+  cutlass::DeviceAllocation<ElementK> block_K;
+  cutlass::DeviceAllocation<ElementP> block_P;
+  cutlass::DeviceAllocation<ElementV> block_V;
+  cutlass::DeviceAllocation<ElementO> block_O;
+  cutlass::DeviceAllocation<ElementOAccum> block_O_accumulate;
+  cutlass::DeviceAllocation<ElementNorm> block_Norm;
+  cutlass::DeviceAllocation<ElementSum> block_Sum;
+
+  // Device copy of offset_P.
+  cutlass::DeviceAllocation<int64_t> offset_P_Device;
+
+  // Device arrays with one pointer per problem (block base + offset).
+  cutlass::DeviceAllocation<ElementQ *> ptr_Q;
+  cutlass::DeviceAllocation<ElementK *> ptr_K;
+  cutlass::DeviceAllocation<ElementP *> ptr_P;
+  cutlass::DeviceAllocation<ElementV *> ptr_V;
+  cutlass::DeviceAllocation<ElementO *> ptr_O;
+  cutlass::DeviceAllocation<ElementOAccum *> ptr_O_accumulate;
+
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Stores a reference to the caller's options together with the fill distribution for
+  /// each tensor and the seed used for random initialization. No device work happens here.
+  TestbedAttention(
+    Options &options_,
+    cutlass::Distribution::Kind init_Q_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_K_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_P_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_V_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_O_ = cutlass::Distribution::Uniform,
+    uint32_t seed_ = 3080
+  ):
+    options(options_),
+    init_Q(init_Q_),
+    init_K(init_K_),
+    init_P(init_P_),
+    init_V(init_V_),
+    init_O(init_O_),
+    seed(seed_)
+  { }
+
+  /// Number of attention problems: one per (batch item, head) pair.
+  int problem_count() const {
+    int const heads = options.head_number;
+    int const batches = options.batch_size;
+    return heads * batches;
+  }
+
+private:
+
+  /// Fills `capacity` elements starting at device pointer `ptr` according to `dist_kind`.
+  ///
+  ///   Uniform    - random values; the range depends on the bit width of Element
+  ///   Gaussian   - random values, parameters Element() and Element(0.5f)
+  ///   Sequential - increasing values
+  ///   otherwise  - all ones
+  template <typename Element>
+  void initialize_tensor_(
+    Element *ptr,
+    size_t capacity, 
+    cutlass::Distribution::Kind dist_kind,
+    uint32_t seed) {
+
+    switch (dist_kind) {
+
+      case cutlass::Distribution::Uniform: {
+        int const element_bits = cutlass::sizeof_bits<Element>::value;
+
+        // Narrow types get a narrow range; everything wider than 8 bits uses [-8, 8].
+        Element upper, lower;
+        if (element_bits == 1) {
+          upper = 2;
+          lower = 0;
+        } else if (element_bits <= 8) {
+          upper = 2;
+          lower = -2;
+        } else {
+          upper = 8;
+          lower = -8;
+        }
+
+        cutlass::reference::device::BlockFillRandomUniform(
+          ptr, capacity, seed, upper, lower, 0);
+        break;
+      }
+
+      case cutlass::Distribution::Gaussian:
+        cutlass::reference::device::BlockFillRandomGaussian(
+          ptr, capacity, seed, Element(), Element(0.5f));
+        break;
+
+      case cutlass::Distribution::Sequential:
+        // Fill with increasing elements
+        cutlass::reference::device::BlockFillSequential(
+          ptr, capacity, Element(1), Element());
+        break;
+
+      default:
+        // Fill with all 1s
+        cutlass::reference::device::BlockFillSequential(
+          ptr, capacity, Element(), Element(1));
+        break;
+    }
+  }
+
+  /// Initializes data structures
+  ///
+  /// Builds the complete device workspace for the current `options`:
+  ///   1. sets the GEMM scalars in `options` (alpha0 = 1/sqrt(head_size), alpha1 = 1, beta = 0);
+  ///   2. computes per-problem leading dimensions and element offsets into the packed buffers;
+  ///   3. uploads problem sizes, leading dimensions and the P offsets to the device;
+  ///   4. allocates the packed Q/K/P/V/O buffers and the per-problem pointer tables;
+  ///   5. fills Q, K and V according to init_Q / init_K / init_V.
+  ///
+  /// Safe to call more than once: the offset vectors are rebuilt from scratch on every call.
+  void initialize_() {
+
+    //
+    // Set scalars for the mha example
+    //
+
+    options.alpha0 = 1.0f / sqrt(float(options.head_size));
+    options.alpha1 = 1.0f;
+    options.beta = 0;
+
+    //
+    // Choose random problem sizes
+    //
+
+    // construct a few problems of random sizes
+    srand(seed);
+
+    int64_t total_elements_Q = 0;
+    int64_t total_elements_K = 0;
+    int64_t total_elements_P = 0;
+    int64_t total_elements_V = 0;
+    int64_t total_elements_O = 0;
+
+    ldq_host.resize(problem_count());
+    ldk_host.resize(problem_count());
+    ldp_host.resize(problem_count());
+    ldv_host.resize(problem_count());
+    ldo_host.resize(problem_count());
+    seqlen_host.resize(problem_count());
+
+    // Offsets are appended below. Start from empty vectors so that a repeated call does
+    // not grow them past problem_count(): offset_P is later copied with its full size
+    // into a device allocation of exactly problem_count() elements.
+    offset_Q.clear();
+    offset_K.clear();
+    offset_P.clear();
+    offset_V.clear();
+    offset_O.clear();
+
+    for (int32_t i = 0; i < problem_count(); ++i) {
+
+      auto problem0 = options.problem_sizes0.at(i);
+      auto problem1 = options.problem_sizes1.at(i);
+
+      ldq_host.at(i) = LayoutQ::packed({problem0.m(), problem0.k()}).stride(0);
+      ldk_host.at(i) = LayoutK::packed({problem0.k(), problem0.n()}).stride(0);
+      ldp_host.at(i) = LayoutP::packed({problem0.m(), problem0.n()}).stride(0);
+      ldv_host.at(i) = LayoutV::packed({problem1.k(), problem1.n()}).stride(0);
+      ldo_host.at(i) = LayoutO::packed({problem1.m(), problem1.n()}).stride(0);
+
+      // m = n for attention problems.
+      seqlen_host.at(i) = problem0.m();
+
+      offset_Q.push_back(total_elements_Q);
+      offset_K.push_back(total_elements_K);
+      offset_P.push_back(total_elements_P);
+      offset_V.push_back(total_elements_V);
+      offset_O.push_back(total_elements_O);
+
+      // Widen before multiplying: the extents are int, and e.g. m * n for long sequences
+      // does not fit in 32 bits.
+      int64_t elements_Q = int64_t(problem0.m()) * problem0.k();
+      int64_t elements_K = int64_t(problem0.k()) * problem0.n();
+      int64_t elements_P = int64_t(problem0.m()) * problem0.n();
+      int64_t elements_V = int64_t(problem1.k()) * problem1.n();
+      int64_t elements_O = int64_t(problem1.m()) * problem1.n();
+
+      total_elements_Q += elements_Q;
+      total_elements_K += elements_K;
+      total_elements_P += elements_P;
+      total_elements_V += elements_V;
+      total_elements_O += elements_O;
+
+    }
+
+    problem_sizes_device0.reset(problem_count());
+    problem_sizes_device1.reset(problem_count());
+    problem_sizes_device0.copy_from_host(options.problem_sizes0.data());
+    problem_sizes_device1.copy_from_host(options.problem_sizes1.data());
+
+    // The unpadded ("real") problem sizes only exist when masking is requested.
+    if (options.use_mask) {
+      problem_sizes_device0_real.reset(problem_count());
+      problem_sizes_device0_real.copy_from_host(options.problem_sizes0_real.data());
+    }
+
+    ldq.reset(problem_count());
+    ldk.reset(problem_count());
+    ldp.reset(problem_count());
+    ldv.reset(problem_count());
+    ldo.reset(problem_count());
+    seqlen.reset(problem_count());
+
+    ldq.copy_from_host(ldq_host.data());
+    ldk.copy_from_host(ldk_host.data());
+    ldp.copy_from_host(ldp_host.data());
+    ldv.copy_from_host(ldv_host.data());
+    ldo.copy_from_host(ldo_host.data());
+    seqlen.copy_from_host(seqlen_host.data());
+
+    //
+    // Assign pointers
+    //
+
+    block_Q.reset(total_elements_Q);
+    block_K.reset(total_elements_K);
+    block_P.reset(total_elements_P);
+    block_V.reset(total_elements_V);
+    block_O.reset(total_elements_O);
+
+    if (kNeedsOutputAccumulatorBuffer) {
+      block_O_accumulate.reset(total_elements_O);
+    }
+
+    offset_P_Device.reset(problem_count());
+
+    // sync offset with device
+    cutlass::device_memory::copy_to_device(offset_P_Device.get(), offset_P.data(), offset_P.size());
+
+    // Host-side pointer tables: base of the packed buffer plus the problem's offset.
+    std::vector<ElementQ *> ptr_Q_host(problem_count());
+    std::vector<ElementK *> ptr_K_host(problem_count());
+    std::vector<ElementP *> ptr_P_host(problem_count());
+    std::vector<ElementV *> ptr_V_host(problem_count());
+    std::vector<ElementO *> ptr_O_host(problem_count());
+    std::vector<ElementOAccum *> ptr_O_accumulate_host(problem_count());
+
+    for (int32_t i = 0; i < problem_count(); ++i) {
+      ptr_Q_host.at(i) = block_Q.get() + offset_Q.at(i);
+      ptr_K_host.at(i) = block_K.get() + offset_K.at(i);
+      ptr_P_host.at(i) = block_P.get() + offset_P.at(i);
+      ptr_V_host.at(i) = block_V.get() + offset_V.at(i);
+      ptr_O_host.at(i) = block_O.get() + offset_O.at(i);
+
+      if (kNeedsOutputAccumulatorBuffer) {
+        ptr_O_accumulate_host.at(i) = block_O_accumulate.get() + offset_O.at(i);
+      }
+    }
+
+    ptr_Q.reset(problem_count());
+    ptr_Q.copy_from_host(ptr_Q_host.data());
+
+    ptr_K.reset(problem_count());
+    ptr_K.copy_from_host(ptr_K_host.data());
+
+    ptr_P.reset(problem_count());
+    ptr_P.copy_from_host(ptr_P_host.data());
+
+    ptr_V.reset(problem_count());
+    ptr_V.copy_from_host(ptr_V_host.data());
+
+    ptr_O.reset(problem_count());
+    ptr_O.copy_from_host(ptr_O_host.data());
+
+    if (kNeedsOutputAccumulatorBuffer) {
+      ptr_O_accumulate.reset(problem_count());
+      ptr_O_accumulate.copy_from_host(ptr_O_accumulate_host.data());
+    }
+
+    //
+    // Initialize the problems of the workspace
+    //
+
+    // Each input tensor gets its own seed so Q, K and V are not filled identically.
+    initialize_tensor_(block_Q.get(), total_elements_Q, init_Q, seed + 1);
+    initialize_tensor_(block_K.get(), total_elements_K, init_K, seed + 2);
+    initialize_tensor_(block_V.get(), total_elements_V, init_V, seed + 3);
+
+  }
+
+  /// Element-wise comparison of a computed tensor against a reference.
+  ///
+  /// An element fails when the difference is NaN/Inf, or when it exceeds BOTH the absolute
+  /// tolerance (0.05) and the relative tolerance (10%). The first failing element is
+  /// printed and the function returns false; if no element fails it returns true.
+  ///
+  /// \param vector_Input      computed values
+  /// \param vector_Input_Ref  reference values
+  /// \param verify_length     number of leading elements to compare; -1 (the default)
+  ///                          compares as many elements as the shorter vector holds
+  ///
+  /// The vectors are taken by const reference so that large tensors are not copied on
+  /// every call.
+  template<typename Element>
+  bool verify_tensor_(std::vector<Element> const &vector_Input,
+                       std::vector<Element> const &vector_Input_Ref,
+                       int64_t verify_length = -1) {
+
+    int64_t size = (vector_Input.size() < vector_Input_Ref.size()) ? vector_Input.size() : vector_Input_Ref.size();
+    size = (verify_length == -1) ? size : verify_length;
+
+    // 0.05 for absolute error
+    float abs_tol = 5e-2f;
+    // 10% for relative error
+    float rel_tol = 1e-1f;
+    for (int64_t i = 0; i < size; ++i) {
+      Element const computed = vector_Input.at(i);
+      Element const reference = vector_Input_Ref.at(i);
+
+      float diff = (float)(computed - reference);
+      float abs_diff = fabs(diff);
+      // The small epsilon keeps the denominator away from zero for a zero reference.
+      float abs_ref = fabs((float)reference + 1e-5f);
+      float relative_diff = abs_diff / abs_ref;
+      if ( (isnan(abs_diff) || isinf(abs_diff)) ||  (abs_diff > abs_tol && relative_diff > rel_tol)) {
+        printf("[%d/%d] diff = %f, rel_diff = %f, {computed=%f, ref=%f}.\n", int(i), int(size), abs_diff, relative_diff, (float)(computed), (float)(reference));
+        return false;
+      }
+
+    }
+
+    return true;
+  }
+
+  /// Checks the kernel output O of every problem against a reference attention result.
+  ///
+  /// Per problem: a reference GEMM computes the scaled Q.K product on the device, the
+  /// softmax is applied row by row on the host, the result is uploaded and a second
+  /// reference GEMM multiplies it with V. The outcome is compared with the kernel's O
+  /// via verify_tensor_(). Returns false as soon as one problem fails, true otherwise.
+  ///
+  /// Side effect: the region of block_P belonging to each checked problem is overwritten
+  /// with the reference softmax matrix.
+  bool verify_() {
+
+    bool passed = true;
+
+    for (int32_t i = 0; i < problem_count(); ++i) {
+      cutlass::gemm::GemmCoord problem0 = options.problem_sizes0.at(i);
+      cutlass::gemm::GemmCoord problem1 = options.problem_sizes1.at(i);
+
+      // Rebuild layouts from the leading dimensions computed in initialize_().
+      LayoutQ layout_Q(ldq_host.at(i));
+      LayoutK layout_K(ldk_host.at(i));
+      LayoutP layout_P(ldp_host.at(i));
+      LayoutV layout_V(ldv_host.at(i));
+      LayoutO layout_O(ldo_host.at(i));
+
+      MatrixCoord extent_Q{problem0.m(), problem0.k()};
+      MatrixCoord extent_K{problem0.k(), problem0.n()};
+      MatrixCoord extent_P{problem0.m(), problem0.n()};
+      MatrixCoord extent_V{problem1.k(), problem1.n()};
+      MatrixCoord extent_O{problem1.m(), problem1.n()};
+
+      // Views into the packed device buffers for this problem.
+      cutlass::TensorView<ElementQ, LayoutQ> view_Q(block_Q.get() + offset_Q.at(i), layout_Q, extent_Q);
+      cutlass::TensorView<ElementK, LayoutK> view_K(block_K.get() + offset_K.at(i), layout_K, extent_K);
+      cutlass::TensorView<ElementP, LayoutP> view_P(block_P.get() + offset_P.at(i), layout_P, extent_P);
+      cutlass::TensorView<ElementV, LayoutV> view_V(block_V.get() + offset_V.at(i), layout_V, extent_V);
+
+      // Scratch device buffer receiving the reference Q.K product.
+      cutlass::DeviceAllocation<ElementP>    block_Ref(layout_P.capacity(extent_P));
+      cutlass::TensorView<ElementP, LayoutP> view_Ref_device(block_Ref.get(), layout_P, extent_P);
+
+      // Scratch device buffer receiving the reference output, zero-filled first.
+      cutlass::DeviceAllocation<ElementO>    block_Ref_O(layout_O.capacity(extent_O));
+      cutlass::TensorView<ElementO, LayoutO> view_Ref_O_device(block_Ref_O.get(), layout_O, extent_O);
+      cutlass::reference::device::TensorFill(view_Ref_O_device, ElementO(0));
+
+      // Reference GEMM
+      // Q.K scaled by alpha0, written to view_Ref_device. options.beta is set to 0 in
+      // initialize_(), so view_P is presumably ignored as a source operand - TODO confirm.
+      cutlass::reference::device::GemmComplex<
+          ElementQ, LayoutQ,
+          ElementK, LayoutK,
+          ElementP, LayoutP, 
+          ElementCompute, ElementAccumulator
+      >(
+        problem0,
+        ElementAccumulator(options.alpha0), 
+        view_Q,
+        Attention::GemmKernel::MM0::Mma::kTransformA,
+        view_K,
+        Attention::GemmKernel::MM0::Mma::kTransformB,
+        ElementAccumulator(options.beta), 
+        view_P, 
+        view_Ref_device, 
+        ElementAccumulator(0)
+      );
+
+      // Compute softmax for P. We need to explicitly compute softmax
+      // over P because softmax is fused to the second GEMM in the
+      // profiled implementation.
+      std::vector<ElementP> matrix_Ref(layout_P.capacity(extent_P));
+      cutlass::device_memory::copy_to_host(matrix_Ref.data(), block_Ref.get(), matrix_Ref.size());
+      cutlass::TensorView<ElementP, LayoutP> view_Ref_host(matrix_Ref.data(), layout_P, extent_P);
+      std::vector<ElementNorm> vector_Norm_Ref(problem0.m());
+      std::vector<ElementSum> vector_Sum_Ref(problem0.m());
+
+      // Number of key columns that take part in the softmax: the unpadded count when
+      // masking is enabled, otherwise the full problem width.
+      int n_dim = options.use_mask ? options.problem_sizes0_real.at(i).n() : problem0.n();
+
+      // Compute softmax for reference matrix
+      for (int m = 0; m < problem0.m(); m++) {
+        // With causal masking, row m only sees columns [0, m].
+        int n_dim_row = n_dim;
+        if (options.causal) {
+          n_dim_row = std::min(m + 1, n_dim);
+        }
+        // Row maximum, subtracted below before exponentiation.
+        ElementSoftmaxCompute max = ElementSoftmaxCompute(view_Ref_host.ref().at({m, 0}));
+        for (int n = 1; n < n_dim_row; n++) {
+           max = std::max(max, ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})));
+        }
+
+        vector_Norm_Ref.at(m) = ElementNorm(max);
+
+        ElementSoftmaxCompute sum = ElementSoftmaxCompute();
+        for (int n = 0; n < n_dim_row; n++) {
+          sum += std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max );
+        }
+        ElementSoftmaxCompute inv_sum = ElementSoftmaxCompute(1.0f / sum);
+
+        vector_Sum_Ref.at(m) = ElementSum(inv_sum);
+
+        // Normalize in place: P[m][n] = exp(P[m][n] - max) / sum.
+        for (int n = 0; n < n_dim_row; n++) {
+          view_Ref_host.ref().at({m, n}) = ElementP(
+            std::exp( ElementSoftmaxCompute(view_Ref_host.ref().at({m, n})) - max ) * inv_sum
+          );
+        }
+        // Mask out the rest of the attention matrix
+        for (int n = n_dim_row; n < n_dim; ++n) {
+          view_Ref_host.ref().at({m, n}) = ElementP(0);
+        }
+
+      }
+
+      // when not using mask, problem_real and problem share the same sizes
+      // With a mask, the padded columns [n_dim, problem0.n()) are zeroed as well.
+      if (options.use_mask) {
+        for (int m = 0; m < problem0.m(); m++) {
+          for (int n = n_dim; n < problem0.n(); n++) {
+            view_Ref_host.ref().at({m, n}) = ElementP(0);
+          }
+        }
+      }
+
+      // Upload the reference softmax into this problem's slice of block_P; view_P
+      // (defined above) aliases that slice and feeds the second GEMM.
+      cutlass::device_memory::copy_to_device(block_P.get() + offset_P.at(i), matrix_Ref.data(), matrix_Ref.size());
+
+      // Reference GEMM
+      // P.V scaled by alpha1, written to view_Ref_O_device.
+      // NOTE(review): the transforms passed here are MM0's, not MM1's - confirm this is intended.
+      cutlass::reference::device::GemmComplex<
+          ElementP, LayoutP,
+          ElementV, LayoutV,
+          ElementO, LayoutO, 
+          ElementCompute, ElementAccumulator
+      >(
+        problem1,
+        ElementAccumulator(options.alpha1), 
+        view_P,
+        Attention::GemmKernel::MM0::Mma::kTransformA,
+        view_V,
+        Attention::GemmKernel::MM0::Mma::kTransformB,
+        ElementAccumulator(options.beta), 
+        view_Ref_O_device, 
+        view_Ref_O_device, 
+        ElementAccumulator(0)
+      );
+
+      // Copy to host memory
+      // (view_Ref is not used again in this function.)
+      cutlass::TensorView<ElementP, LayoutP> view_Ref(matrix_Ref.data(), layout_P, extent_P);
+
+      // Kernel output and reference output for this problem, both on the host.
+      std::vector<ElementO> matrix_O(layout_O.capacity(extent_O));
+      cutlass::device_memory::copy_to_host(matrix_O.data(),   block_O.get() + offset_O.at(i), matrix_O.size());
+      std::vector<ElementO> matrix_Ref_O(layout_O.capacity(extent_O));
+      cutlass::device_memory::copy_to_host(matrix_Ref_O.data(), block_Ref_O.get(), matrix_Ref_O.size());
+
+
+      bool verified_O = false;
+      if (!verified_O) {
+        verified_O = verify_tensor_<ElementO>(matrix_O, matrix_Ref_O);
+      }
+
+      passed = passed && verified_O;
+
+      // Stop at the first failing problem.
+      if (!passed) {
+        std::cerr << "\n***\nError - problem " << i << " failed the QA check\n***\n" << std::endl;
+
+        if (!verified_O) {
+          std::cout << "Final matrix output is incorrect" << std::endl;
+        }
+
+        return passed;
+      }
+
+    }
+
+    return passed;
+  }
+
+public:
+
+
+  /// Executes a CUTLASS Attention kernel and measures runtime.
+  ///
+  /// Steps: check that the device can run the kernel, build the workspace with
+  /// initialize_(), run the kernel once (checking it with verify_() when
+  /// options.reference_check is set), run it once more as a warm-up, then time
+  /// options.iterations launches between two CUDA events and print a summary.
+  ///
+  /// Returns the populated Result. Every failure path returns `result` itself so that the
+  /// recorded `status` / `error` reach the caller, and the CUDA events are destroyed on
+  /// every path that created them. `result.passed` reflects the correctness check only.
+  Result profile() {
+
+    Result result;
+    result.passed = false;
+
+    int threadblock_count = Attention::sufficient(options.problem_sizes1.data(), options.problem_count);
+
+    // Early exit
+    if (!threadblock_count) {
+      std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped FMHA kernel." << std::endl;
+      return result;
+    }
+
+    // Initialize the problem
+    initialize_();
+
+    typename Attention::Arguments args(
+      problem_sizes_device0.get(),
+      problem_sizes_device1.get(),
+      options.problem_count,
+      threadblock_count,
+      ptr_Q.get(),
+      ptr_K.get(),
+      ptr_P.get(),
+      ptr_V.get(),
+      ptr_O.get(),
+      ptr_O_accumulate.get(),
+      ldq.get(),
+      ldk.get(),
+      ldp.get(),
+      ldv.get(),
+      ldo.get(),
+      options.causal,
+      options.alpha0,
+      options.problem_sizes1.data()
+    );
+
+    Attention fmha;
+
+    size_t workspace_size = fmha.get_workspace_size(args);
+    cutlass::DeviceAllocation<uint8_t> workspace(workspace_size);
+
+    result.status = fmha.initialize(args, workspace.get());
+
+    if (result.status != cutlass::Status::kSuccess) {
+      std::cerr << "Failed to initialize CUTLASS Grouped FMHA kernel." << std::endl;
+      return result;
+    }
+
+    // Run the grouped FMHA object
+    result.status = fmha.run();
+
+    if (result.status != cutlass::Status::kSuccess) {
+      std::cerr << "Failed to run CUTLASS Grouped FMHA kernel." << std::endl;
+      return result;
+    }
+
+    // Wait for completion
+    result.error = cudaDeviceSynchronize();
+
+    if (result.error != cudaSuccess)  {
+      std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error) << std::endl;
+      return result;
+    }
+
+    //
+    // Verify correctness
+    //
+    result.passed = true;
+
+    if (options.reference_check) {
+      result.passed = verify_();
+    }
+
+    //
+    // Warm-up run of the grouped FMHA object
+    //
+    result.status = fmha.run();
+
+    if (result.status != cutlass::Status::kSuccess) {
+      std::cerr << "Failed to run CUTLASS Grouped FMHA kernel." << std::endl;
+      return result;
+    }
+
+    //
+    // Construct events
+    //
+
+    cudaEvent_t events[2] = {nullptr, nullptr};
+
+    // Destroys every event created so far. Called on all exit paths below so that an
+    // early return does not leak CUDA events.
+    auto destroy_events = [&events]() {
+      for (auto & event : events) {
+        if (event != nullptr) {
+          (void)cudaEventDestroy(event);
+          event = nullptr;
+        }
+      }
+    };
+
+    for (auto & event : events) {
+      result.error = cudaEventCreate(&event);
+      if (result.error != cudaSuccess) {
+        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+        // Return the Result carrying the CUDA error, like every other failure path.
+        destroy_events();
+        return result;
+      }
+    }
+
+    // Record an event at the start of a series of FMHA operations
+    result.error = cudaEventRecord(events[0]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      destroy_events();
+      return result;
+    }
+
+    //
+    // Run profiling loop
+    //
+
+    for (int iter = 0; iter < this->options.iterations; ++iter) {
+      fmha();
+    }
+
+    //
+    // Stop profiling loop
+    //
+
+    // Record an event when the GEMM operations have been launched.
+    result.error = cudaEventRecord(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+      destroy_events();
+      return result;
+    }
+
+    // Wait for work on the device to complete.
+    result.error = cudaEventSynchronize(events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+      destroy_events();
+      return result;
+    }
+
+    // Measure elapsed runtime
+    float runtime_ms = 0;
+    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+      destroy_events();
+      return result;
+    }
+
+    // Compute average runtime and GFLOPs.
+    result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
+    result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
+
+    //
+    // Cleanup
+    //
+
+    destroy_events();
+
+    std::cout << std::endl;
+    std::cout << "CUTLASS Attention:\n"
+      << "====================================================" << std::endl;
+    std::cout << "    " << " {seq length Q, seq length KV, head size, head size V, head number, batch size} = {" << options.seq_length \
+      << ", " << options.seq_length_kv << ", " << options.head_size << ", " << options.head_size_v << ", " << options.head_number\
+      << ", " << options.batch_size << "}." << std::endl;
+    options.print_problems();
+    std::cout << std::endl;
+    std::cout << "    " << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << "    " << "GFLOPs: " << result.gflops << std::endl;
+
+    return result;
+  }
+
+
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Instantiates the grouped FMHA kernel for the given tile configuration and scheduling
+/// mode, profiles it through TestbedAttention and reports the outcome.
+///
+/// Returns 0 when the testbed reports success, -1 otherwise.
+template <
+  int kQueriesPerBlock,
+  int kKeysPerBlock,
+  int kMaxK,
+  cutlass::gemm::kernel::GroupScheduleMode GroupScheduleMode_
+>
+int run_grouped(Options& options) {
+  using AttentionKernel = typename cutlass::gemm::kernel::DefaultFMHAGrouped<
+    cutlass::half_t,      // scalar_t
+    cutlass::arch::Sm80,  // ArchTag
+    true,                 // Memory is aligned
+    kQueriesPerBlock,
+    kKeysPerBlock,
+    kMaxK,
+    GroupScheduleMode_
+  >::FMHAKernel;
+
+  using FMHA = cutlass::gemm::device::GemmGrouped<AttentionKernel>;
+
+  // Test and profile
+  TestbedAttention<FMHA> testbed(options);
+  Result const outcome = testbed.profile();
+
+  if (outcome.passed) {
+    std::cout << "\nPassed\n";
+    return 0;
+  }
+
+  std::cout << "Profiling CUTLASS attention has failed.\n";
+  std::cout << "\nFailed\n";
+  return -1;
+}
+
+
+/// Turns the runtime scheduler choice in `options.scheduler_mode` into the matching
+/// compile-time template argument of run_grouped() and forwards its return value.
+/// Any mode other than kDeviceOnly is run as kHostPrecompute.
+template <
+  int kQueriesPerBlock,
+  int kKeysPerBlock,
+  int kMaxK
+>
+int run_attention(Options& options) {
+  using cutlass::gemm::kernel::GroupScheduleMode;
+
+  if (options.scheduler_mode != GroupScheduleMode::kDeviceOnly) {
+    return run_grouped<kQueriesPerBlock,
+                       kKeysPerBlock,
+                       kMaxK,
+                       GroupScheduleMode::kHostPrecompute>(options);
+  }
+
+  return run_grouped<kQueriesPerBlock,
+                     kKeysPerBlock,
+                     kMaxK,
+                     GroupScheduleMode::kDeviceOnly>(options);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Entry point: checks the device and toolkit, parses the command line, rejects
+/// unsupported options and dispatches to the kernel configuration matching the V head size.
+///
+/// Exit codes: 0 on success, on --help, or when the GPU/toolkit is too old to run the
+/// example; -1 on a CUDA query error, an option parsing error or a failed run; -2 for
+/// unsupported option values.
+int main(int argc, char const **args) {
+
+  //
+  // This example uses mma.sync to directly access Tensor Cores to achieve peak performance.
+  //
+
+  cudaDeviceProp props;
+
+  cudaError_t const query_status = cudaGetDeviceProperties(&props, 0);
+  if (query_status != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(query_status) << std::endl;
+    return -1;
+  }
+
+  // This example requires an NVIDIA Ampere-architecture GPU and a CUDA 11+ toolkit.
+  bool const toolkit_too_old = __CUDACC_VER_MAJOR__ < 11;
+  bool const device_too_old = props.major < 8;
+  if (toolkit_too_old || device_too_old) {
+    std::cout 
+      << "CUTLASS's CUTLASS Attention example requires a GPU of NVIDIA's Ampere Architecture or "
+      << "later (compute capability 80 or greater).\n";
+
+    return 0;
+  }
+
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  // Option values the example cannot handle yet.
+  if (options.use_mask) {
+    std::cerr << "--use_mask is not supported at the moment\n";
+    return -2;
+  }
+  if (options.alignment != 1) {
+    std::cerr << "--alignment=1 is the only supported value\n";
+    return -2;
+  }
+
+  // Determine kernel configuration based on head size.
+  //
+  // Small heads (head_size_v <= 64): each block operates over 64 queries and 64 keys,
+  // and partial results can be stored in the register file.
+  if (options.head_size_v <= 64) {
+    constexpr int kMaxK = 64; // <- Decrease to 32/16 if your problem is smaller
+    constexpr int kQueriesPerBlock = 64;
+    constexpr int kKeysPerBlock = 64;
+    return run_attention<kQueriesPerBlock, kKeysPerBlock, kMaxK>(options);
+  }
+
+  // Large heads (head_size_v > 64): each block operates over 32 queries and 128 keys,
+  // and partial results are stored in shared memory.
+  constexpr int kQueriesPerBlock = 32;
+  constexpr int kKeysPerBlock = 128;
+  if (options.head_size_v > kKeysPerBlock) {
+    return run_attention<kQueriesPerBlock, kKeysPerBlock, 65536>(options);
+  }
+  return run_attention<kQueriesPerBlock, kKeysPerBlock, 128>(options);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/gemm/custom_mma.h b/third_party/fused_multi_head_attention/gemm/custom_mma.h
new file mode 100644
index 0000000000..f3a1d4cbc2
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm/custom_mma.h
@@ -0,0 +1,124 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "custom_mma_multistage.h"
+#include "custom_mma_pipelined.h"
+#include "cutlass/gemm/threadblock/mma_multistage.h"
+#include "cutlass/gemm/threadblock/mma_pipelined.h"
+
+/// Maps a stock CUTLASS threadblock MMA type `Mma` to its Custom* counterpart, exposed as
+/// the nested type `Mma`. Only the partial specializations below are defined; `kMaxK` is
+/// a compile-time bound supplied by the caller.
+template <typename Mma, int kMaxK>
+struct MakeCustomMma;
+
+/// Specialization for cutlass::gemm::threadblock::MmaMultistage: yields
+/// CustomMmaMultistage with the same template arguments, except that the stage count
+/// may be reduced (see kStages) and kMaxK is appended.
+template <
+    typename Shape,
+    typename IteratorA,
+    typename SmemIteratorA,
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    typename IteratorB,
+    typename SmemIteratorB,
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    typename ElementC,
+    typename LayoutC,
+    typename Policy,
+    int Stages,
+    cutlass::gemm::SharedMemoryClearOption SharedMemoryClear,
+    int kMaxK>
+struct MakeCustomMma<
+    cutlass::gemm::threadblock::MmaMultistage<
+        Shape,
+        IteratorA,
+        SmemIteratorA,
+        CacheOpA,
+        IteratorB,
+        SmemIteratorB,
+        CacheOpB,
+        ElementC,
+        LayoutC,
+        Policy,
+        Stages,
+        SharedMemoryClear>,
+    kMaxK> {
+  // Reduce the number of stages if we don't need that many
+  // kMaxK == INT_MAX is treated as "unbounded" and keeps Stages unchanged; otherwise
+  // the count is capped at ceil(kMaxK / Shape::kK).
+  static int constexpr kStages =
+      kMaxK == cutlass::platform::numeric_limits<int>::max()
+      ? Stages
+      : cutlass::const_min(
+            Stages,
+            (kMaxK + int(Shape::kK) - 1) / int(Shape::kK));
+  using Mma = cutlass::gemm::threadblock::CustomMmaMultistage<
+      Shape,
+      IteratorA,
+      SmemIteratorA,
+      CacheOpA,
+      IteratorB,
+      SmemIteratorB,
+      CacheOpB,
+      ElementC,
+      LayoutC,
+      Policy,
+      kStages,
+      SharedMemoryClear,
+      kMaxK>;
+};
+
+/// Specialization for cutlass::gemm::threadblock::MmaPipelined: yields
+/// CustomMmaPipelined with the same template arguments. kMaxK only takes part in
+/// matching the specialization; it is not forwarded to the resulting type.
+template <
+    typename Shape,
+    typename IteratorA,
+    typename SmemIteratorA,
+    typename IteratorB,
+    typename SmemIteratorB,
+    typename ElementC,
+    typename LayoutC,
+    typename Policy,
+    int kMaxK>
+struct MakeCustomMma<
+    cutlass::gemm::threadblock::MmaPipelined<
+        Shape,
+        IteratorA,
+        SmemIteratorA,
+        IteratorB,
+        SmemIteratorB,
+        ElementC,
+        LayoutC,
+        Policy>,
+    kMaxK> {
+  using Mma = cutlass::gemm::threadblock::CustomMmaPipelined<
+      Shape,
+      IteratorA,
+      SmemIteratorA,
+      IteratorB,
+      SmemIteratorB,
+      ElementC,
+      LayoutC,
+      Policy>;
+};
diff --git a/third_party/fused_multi_head_attention/gemm/custom_mma_base.h b/third_party/fused_multi_head_attention/gemm/custom_mma_base.h
new file mode 100644
index 0000000000..66c099d15b
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm/custom_mma_base.h
@@ -0,0 +1,182 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Base class providing shared-memory operand storage for the custom threadblock-scoped MMAs.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Base class for the custom threadblock-scoped MMAs: declares the
+/// shared-memory operand storage and owns the warp-level tile iterators.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class CustomMmaBase {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<
+      Shape::kM / WarpGemm::kM,
+      Shape::kN / WarpGemm::kN,
+      Shape::kK / WarpGemm::kK>;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage for one operand: an aligned buffer plus a packed layout
+  template <typename Element, typename OperandShape, typename OperandLayout>
+  struct OperandSharedStorage {
+    AlignedBuffer<Element, OperandShape::kCount> buffer;
+    using TensorRef = TensorRef<Element, OperandLayout>;
+
+    CUTLASS_DEVICE
+    static OperandLayout Layout() {
+      return OperandLayout::packed({OperandShape::kRow, OperandShape::kColumn});
+    }
+
+    /// Returns a TensorRef to the operand
+    CUTLASS_HOST_DEVICE
+    TensorRef ref() {
+      return TensorRef{buffer.data(), Layout()};
+    }
+  };
+
+  /// Shape of the A matrix operand in shared memory
+  using ShapeA = MatrixShape<
+      Shape::kM + Policy::SmemPaddingA::kRow,
+      Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
+
+  /// Shape of the B matrix operand in shared memory
+  using ShapeB = MatrixShape<
+      Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+      Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+  using SharedStorageA = OperandSharedStorage<
+      typename Operator::ElementA,
+      ShapeA,
+      typename Operator::LayoutA>;
+  using SharedStorageB = OperandSharedStorage<
+      typename Operator::ElementB,
+      ShapeB,
+      typename Operator::LayoutB>;
+  using TensorRefA = typename SharedStorageA::TensorRef;
+  using TensorRefB = typename SharedStorageB::TensorRef;
+
+  struct SharedStorage {
+    /// Buffer for A operand
+    SharedStorageA operand_A;
+
+    /// Buffer for B operand
+    SharedStorageB operand_B;
+  };
+
+ protected:
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A operand from shared memory
+  typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+ public:
+  /// Construct from the per-operand shared storage objects
+  CUTLASS_DEVICE
+  CustomMmaBase(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      SharedStorageA& shared_storageA,
+      SharedStorageB& shared_storageB,
+      ///< ID within the threadblock (not used by this base class)
+      int thread_idx,
+      ///< ID of warp (not used by this base class)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_A_(shared_storageA.ref(), lane_idx),
+        warp_tile_iterator_B_(shared_storageB.ref(), lane_idx) {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/gemm/custom_mma_multistage.h b/third_party/fused_multi_head_attention/gemm/custom_mma_multistage.h
new file mode 100644
index 0000000000..145315e413
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm/custom_mma_multistage.h
@@ -0,0 +1,760 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped GEMM mainloop built on cp.async.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "custom_mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Threadblock-scoped multistage MMA that stages operands in shared memory
+/// with cp.async; the prologue may be issued separately from the mainloop.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Upper bound on the K dimension
+    int kMaxK = cutlass::platform::numeric_limits<int>::max(),
+    /// Used for partial specialization
+    typename Enable = bool>
+class CustomMmaMultistage : public CustomMmaBase<Shape_, Policy_, Stages> {
+ public:
+  ///< Base class
+  using Base = CustomMmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+    static_assert(
+        Base::kWarpGemmIterations > 1,
+        "The pipelined structure requires at least two warp-level "
+        "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load one group of operand A
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) /
+        Base::kWarpGemmIterations;
+
+    /// Number of cp.async instructions to load one group of operand B
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) /
+        Base::kWarpGemmIterations;
+  };
+
+  static bool const kSmemContainsEntireMat = kMaxK <= Shape::kK * Stages;
+  static constexpr int kNumStagesConcurrentLoad =
+      kSmemContainsEntireMat ? Stages : Stages - 1; // stages issued by _prologue
+
+ private:
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  bool prologue_done_;
+
+  // Set to `true` to ensure the accumulator will be zero outside the GEMM
+  // footprint
+  bool zero_outside_bounds_;
+
+ public:
+  /// Construct from the per-operand shared storage objects
+  CUTLASS_DEVICE
+  CustomMmaMultistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx),
+        smem_iterator_A_(shared_storageA.ref(), thread_idx),
+        smem_iterator_B_(shared_storageB.ref(), thread_idx),
+        prologue_done_(false),
+        zero_outside_bounds_(false) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+  CUTLASS_DEVICE
+  CustomMmaMultistage(
+      ///< Combined A/B shared storage; forwarded to the constructor above
+      typename Base::SharedStorage& st,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : CustomMmaMultistage(
+            st.operand_A,
+            st.operand_B,
+            thread_idx,
+            warp_idx,
+            lane_idx) {}
+
+  CUTLASS_DEVICE
+  bool set_prologue_done(bool value) { // true: operator() issues no prologue loads
+    prologue_done_ = value;
+    return true;
+  }
+
+  CUTLASS_DEVICE
+  bool set_zero_outside_bounds(bool value) { // true: mainloop copies use zfill
+    zero_outside_bounds_ = value;
+    return true;
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    prologue<kLoadA, kLoadB>(
+        shared_storage.operand_A,
+        shared_storage.operand_B,
+        iterator_A,
+        iterator_B,
+        thread_idx,
+        problem_size_k);
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    SmemIteratorA smem_iterator_A(shared_storageA.ref(), thread_idx);
+    SmemIteratorB smem_iterator_B(shared_storageB.ref(), thread_idx);
+    int32_t iter = (problem_size_k + Base::Shape::kK - 1) / Base::Shape::kK;
+    _prologue<kLoadA, kLoadB>(
+        iterator_A, iterator_B, iter, smem_iterator_A, smem_iterator_B);
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(
+      IteratorA& iterator_A,
+      IteratorB& iterator_B,
+      int group_start_A = 0,
+      int group_start_B = 0) {
+    iterator_A.set_iteration_index(
+        group_start_A * IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async copy of up to kAccessesPerGroupA accesses of operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType*>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+            IteratorA::ThreadMap::kElementsPerAccess /
+            IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+
+          if (zero_outside_bounds_ ||
+              SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, iterator_A.valid());
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(
+        group_start_B * IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async copy of up to kAccessesPerGroupB accesses of operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType*>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+            IteratorB::ThreadMap::kElementsPerAccess /
+            IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+
+          if (zero_outside_bounds_ ||
+              SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          } else {
+            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, iterator_B.valid());
+          }
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void _prologue(
+      IteratorA& iterator_A,
+      IteratorB& iterator_B,
+      int32_t& gemm_k_iterations,
+      SmemIteratorA& smem_iterator_A_,
+      SmemIteratorB& smem_iterator_B_) {
+    // Issue kNumStagesConcurrentLoad complete stages, one cp.async fence each
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < kNumStagesConcurrentLoad;
+         ++stage, --gemm_k_iterations) {
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType*>(
+                smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); // NOTE(review): never read
+
+          if (kLoadA) { // when false, only the iterators advance
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, iterator_A.get(), iterator_A.valid());
+          }
+
+          ++iterator_A;
+        }
+
+        ++smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType*>(
+                smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          if (kLoadB) { // when false, only the iterators advance
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, iterator_B.get(), iterator_B.valid());
+          }
+
+          ++iterator_B;
+        }
+
+        ++smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+
+      smem_iterator_A_.add_tile_offset({0, 1});
+      smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of K iterations (stages of Shape::kK) left to process
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC& accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const& src_accum) {
+    //
+    // Prologue
+    //
+
+    if (!prologue_done_) {
+      _prologue<true, true>(
+          iterator_A,
+          iterator_B,
+          gemm_k_iterations,
+          smem_iterator_A_,
+          smem_iterator_B_);
+    } else if (!kSmemContainsEntireMat) {
+      _prologue<false, false>(
+          iterator_A,
+          iterator_B,
+          gemm_k_iterations,
+          smem_iterator_A_,
+          smem_iterator_B_);
+    } else {
+      gemm_k_iterations -= kNumStagesConcurrentLoad;
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    //
+    // Clear the remaining tiles of SMEM. This is a functional requirement for
+    // some kernels so that all accumulator elements outside the GEMM footprint
+    // are zero.
+    //
+
+    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
+      /// Iterator to write threadblock-scoped tile of A operand to shared
+      /// memory
+      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
+
+      typename IteratorA::AccessType zero_A;
+      zero_A.clear();
+
+      last_smem_iterator_A.set_iteration_index(0);
+
+      // Store zeros for operand A through the copied iterator (no cp.async)
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType*>(
+                last_smem_iterator_A.get());
+
+        *dst_ptr = zero_A;
+
+        ++last_smem_iterator_A;
+      }
+
+      /// Iterator to write threadblock-scoped tile of B operand to shared
+      /// memory
+      SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
+      typename IteratorB::AccessType zero_B;
+
+      zero_B.clear();
+      last_smem_iterator_B.set_iteration_index(0);
+
+      // Store zeros for operand B through the copied iterator (no cp.async)
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType* dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType*>(
+                last_smem_iterator_B.get());
+
+        *dst_ptr = zero_B;
+
+        ++last_smem_iterator_B;
+      }
+    }
+
+    // Waits until at most kNumStagesConcurrentLoad-1 cp.async groups are pending.
+    cutlass::arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(
+        warp_transformed_frag_A[0],
+        warp_transformed_frag_B[0],
+        warp_loaded_frag_A[0],
+        warp_loaded_frag_B[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC> plus_accum;
+
+    FragmentC tmp_accum;
+
+    if (platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-kNumStagesConcurrentLoad);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        // In case of a non-circular buffer ("kSmemContainsEntireMat")
+        // make sure we don't load out of bounds data.
+        if (!kSmemContainsEntireMat ||
+            gemm_k_iterations > (-kNumStagesConcurrentLoad) ||
+            warp_mma_k < Base::kWarpGemmIterations - 1) {
+          this->warp_tile_iterator_A_.load(
+              warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(
+              warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+        }
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(
+              warp_transformed_frag_A[warp_mma_k % 2],
+              warp_transformed_frag_B[warp_mma_k % 2],
+              warp_loaded_frag_A[warp_mma_k % 2],
+              warp_loaded_frag_B[warp_mma_k % 2]);
+
+        if (platform::is_same<
+                typename Operator::MathOperator,
+                arch::OpMultiplyAddFastF32>::value ||
+            platform::is_same<
+                typename Operator::MathOperator,
+                arch::OpMultiplyAddComplexFastF32>::value) {
+          warp_mma(
+              tmp_accum,
+              warp_transformed_frag_A[warp_mma_k % 2],
+              warp_transformed_frag_B[warp_mma_k % 2],
+              tmp_accum);
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma(
+              accum,
+              warp_transformed_frag_A[warp_mma_k % 2],
+              warp_transformed_frag_B[warp_mma_k % 2],
+              accum);
+        }
+
+        // Issue global->shared copies for this stage
+        if (!kSmemContainsEntireMat &&
+            warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(
+              iterator_A,
+              iterator_B,
+              group_start_iteration_A,
+              group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          if (!kSmemContainsEntireMat) {
+            int group_start_iteration_A, group_start_iteration_B;
+            group_start_iteration_A =
+                (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+            group_start_iteration_B =
+                (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+            copy_tiles_and_advance(
+                iterator_A,
+                iterator_B,
+                group_start_iteration_A,
+                group_start_iteration_B);
+          }
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until at most kNumStagesConcurrentLoad-1 cp.async groups are pending.
+          cutlass::arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (!kSmemContainsEntireMat &&
+              smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0,
+                 -Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(
+              warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+              warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+              warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+      }
+    }
+
+    if (platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/gemm/custom_mma_pipelined.h b/third_party/fused_multi_head_attention/gemm/custom_mma_pipelined.h
new file mode 100644
index 0000000000..b967b86c01
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm/custom_mma_pipelined.h
@@ -0,0 +1,401 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "custom_mma_base.h"
+#include "cutlass/gemm/gemm.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Double-buffered (two-stage) threadblock-scoped matrix multiply-accumulate that
+/// stages tiles of A and B through shared memory for the warp-level Operator.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to A operand
+    typename TransformA_ = NumericArrayConverter<
+        typename SmemIteratorA_::Element,
+        typename IteratorA_::Element,
+        IteratorA_::Fragment::kElements>,
+    ///
+    /// Transformation applied to B operand
+    typename TransformB_ = NumericArrayConverter<
+        typename SmemIteratorB_::Element,
+        typename IteratorB_::Element,
+        IteratorB_::Fragment::kElements>,
+    /// Used for partial specialization
+    typename Enable = bool>
+class CustomMmaPipelined : public CustomMmaBase<Shape_, Policy_, 2> {
+ public:
+  ///< Base class
+  using Base = CustomMmaBase<Shape_, Policy_, 2>;
+
+  using Shape =
+      Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA =
+      IteratorA_; ///< Iterates over tiles of A operand in global memory
+  using IteratorB =
+      IteratorB_; ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_; ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_; ///< Layout of accumulator matrix
+  using Policy = Policy_; ///< Policy describing tuning details
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA = typename IteratorA::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert(
+      (Base::kStages == 2),
+      "MmaPipelined requires kStages set to value 2");
+
+  static bool const kSmemContainsEntireMat = false;
+
+ private:
+  using WarpFragmentA = typename Operator::FragmentA;
+  using WarpFragmentB = typename Operator::FragmentB;
+
+ protected:
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+ public:
+  /// Construct from separate shared-storage buffers for the A and B operands
+  CUTLASS_DEVICE
+  CustomMmaPipelined(
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      int thread_idx, ///< ID within the threadblock
+      int warp_idx, ///< ID of warp
+      int lane_idx ///< ID of each thread within a warp
+      )
+      : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx),
+        smem_iterator_A_(shared_storageA.ref(), thread_idx),
+        smem_iterator_B_(shared_storageB.ref(), thread_idx) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+  CUTLASS_DEVICE
+  CustomMmaPipelined(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage& st,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : CustomMmaPipelined( // delegate to the A/B-storage constructor above
+            st.operand_A,
+            st.operand_B,
+            thread_idx,
+            warp_idx,
+            lane_idx) {}
+  /// No-op in the pipelined variant: `value` is ignored and false is returned.
+  CUTLASS_DEVICE
+  bool set_prologue_done(bool value) {
+    return false; // NOT IMPLEMENTED FOR PIPELINED
+  }
+  /// No-op in the pipelined variant: `value` is ignored and false is returned.
+  CUTLASS_DEVICE
+  bool set_zero_outside_bounds(bool value) {
+    // NOT NEEDED FOR PIPELINED: shared memory will always be zero-filled
+    return false; // explicit result; falling off a non-void function is UB
+  }
+  /// Convenience overload: forwards the operand_A / operand_B storage members.
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    prologue<kLoadA, kLoadB>(
+        shared_storage.operand_A,
+        shared_storage.operand_B,
+        iterator_A,
+        iterator_B,
+        thread_idx,
+        problem_size_k);
+  }
+  /// Prologue taking separate A / B storage; intentionally empty for this class.
+  template <bool kLoadA = true, bool kLoadB = true>
+  CUTLASS_DEVICE static void prologue(
+      typename Base::SharedStorageA& shared_storageA,
+      typename Base::SharedStorageB& shared_storageB,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      int thread_idx,
+      int problem_size_k) {
+    // NOT IMPLEMENTED FOR PIPELINED
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      int gemm_k_iterations, ///< number of iterations of the mainloop
+      FragmentC& accum, ///< destination accumulator tile
+      IteratorA iterator_A, ///< iterator over A operand in global memory
+      IteratorB iterator_B, ///< iterator over B operand in global memory
+      FragmentC const& src_accum, ///< source accumulator tile
+      TransformA transform_A =
+          TransformA(), ///< transformation applied to A fragment
+      TransformB transform_B =
+          TransformB()) { ///< transformation applied to B fragment
+
+    // Prologue: seed `accum` from `src_accum`, fetch the first k-tile of A and
+    // B through the global iterators, store it (after transform) to shared
+    // memory, then preload the first warp-level fragments.
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentA tb_frag_A;
+    FragmentB tb_frag_B;
+
+    tb_frag_A.clear();
+    tb_frag_B.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_A;
+    ++iterator_B;
+
+    this->smem_iterator_A_.store(transform_A(tb_frag_A));
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentB warp_frag_B[2];
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+
+    int smem_write_stage_idx = 1; // the prologue already wrote one stage
+
+    // Avoid reading out of bounds
+    iterator_A.clear_mask(gemm_k_iterations <= 1);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER*
+    // issuing shared memory loads (which have the tightest latency
+    // requirement).
+
+    // Mainloop: each outer iteration consumes one k-tile already in shared
+    // memory while the next tile is fetched into tb_frag_A / tb_frag_B and
+    // written to shared memory on the last warp-level step.
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A(tb_frag_A));
+
+          this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+          __syncthreads();
+
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          } else {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0,
+                 -Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1; // toggle between the two stages
+        }
+
+        this->warp_tile_iterator_A_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+        this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k == 0) {
+          iterator_A.load(tb_frag_A);
+          iterator_B.load(tb_frag_B);
+
+          ++iterator_A;
+          ++iterator_B;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A.clear_mask(gemm_k_iterations <= 2);
+          iterator_B.clear_mask(gemm_k_iterations <= 2);
+        }
+
+        warp_mma(
+            accum,
+            warp_frag_A[warp_mma_k % 2],
+            warp_frag_B[warp_mma_k % 2],
+            accum);
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/gemm/find_default_mma.h b/third_party/fused_multi_head_attention/gemm/find_default_mma.h
new file mode 100644
index 0000000000..560da450ff
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm/find_default_mma.h
@@ -0,0 +1,191 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Cutlass provides helper template functions to figure out the right
+   data structures to instantiate to run a GEMM with various parameters (see
+   `cutlass/gemm/threadblock/default_mma.h`). However, due to template
+   instantiation priority rules, it will only create an MmaMultiStage with
+   kStages=3 (otherwise it creates an MmaPipelined, which is not compatible
+   with FastF32). kStages=3 uses too much shared memory and we want kStages=2,
+   so we just copy-pasted some code from the `default_mma.h` and
+   `default_mma_core.h` files and wrapped this template to allow our use case.
+
+    This is really only for the FastF32 case - aka using TensorCores with fp32.
+*/
+
+#pragma once
+
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Layout type for C and D matrix operand
+    typename LayoutC,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    typename Enable_ = void> // slot filled with enable_if by specializations
+struct FindDefaultMma { // primary template: plain pass-through to DefaultMma
+  static constexpr bool AccumulatorsInRowMajor = false;
+  static constexpr SharedMemoryClearOption SharedMemoryClear =
+      SharedMemoryClearOption::kNone;
+  using DefaultMma = cutlass::gemm::threadblock::DefaultMma< // args unchanged
+      ElementA,
+      LayoutA,
+      kAlignmentA,
+      ElementB,
+      LayoutB,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      Stages,
+      Operator,
+      AccumulatorsInRowMajor,
+      SharedMemoryClear>;
+};
+
+/// Specialization for Sm80 TensorOp, RowMajor C and kAlignmentA > 1 (FastF32 use)
+template <
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    int kStages, // stage count actually used by ThreadblockMma below
+    typename Operator>
+struct FindDefaultMma<
+    ElementA_,
+    LayoutA_,
+    kAlignmentA,
+    ElementB_,
+    LayoutB_,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm80,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    kStages,
+    Operator,
+    typename cutlass::platform::enable_if<(kAlignmentA > 1)>::type> {
+  using LayoutC = layout::RowMajor;
+  using OperatorClass = arch::OpClassTensorOp;
+  using ArchTag = arch::Sm80;
+
+  using DefaultMma_ = cutlass::gemm::threadblock::DefaultMma<
+      ElementA_,
+      LayoutA_,
+      kAlignmentA,
+      ElementB_,
+      LayoutB_,
+      kAlignmentB,
+      ElementAccumulator,
+      LayoutC,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      3, // fixed here regardless of kStages; ThreadblockMma below uses kStages
+      Operator>;
+  struct DefaultMma : DefaultMma_ { // inherits DefaultMma_, redefines one alias
+    using MmaCore_ = typename DefaultMma_::MmaCore;
+    // Re-define the threadblock-scoped multistage mma with the caller's kStages
+    using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+        typename MmaCore_::Shape,
+        typename DefaultMma_::IteratorA,
+        typename MmaCore_::SmemIteratorA,
+        MmaCore_::kCacheOpA,
+        typename DefaultMma_::IteratorB,
+        typename MmaCore_::SmemIteratorB,
+        MmaCore_::kCacheOpB,
+        ElementAccumulator,
+        LayoutC,
+        typename MmaCore_::MmaPolicy,
+        kStages>;
+  };
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/third_party/fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h b/third_party/fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h
new file mode 100644
index 0000000000..7692389c5c
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h
@@ -0,0 +1,378 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/functional.h"
+#include "cutlass/gemm/warp/mma_simt_tile_iterator.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h"
+#include "cutlass/matrix_shape.h"
+
+/*
+TensorCores have different accumulator layouts.
+This file provides helper classes that map the i-th
+accumulator element to its corresponding matrix row/col.
+*/
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AccumLambdaIteratorSm80 { // per-lane (row, col) mapping of accumulators
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  using Policy = typename T::Policy;
+  using InstructionShape = typename T::InstructionShape;
+  using OpDelta = typename T::OpDelta;
+  using Shape = typename T::Shape;
+  static int const kElementsPerAccess = InstructionShape::kN / 4;
+  static int const kRowsPerTile = 8;
+  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+  /// Base (row, col) for a lane from lane_id and tile_offset; warp_id is unused.
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    int quad = (lane_id >> 2); // lane_id / 4
+    int lane_in_quad = (lane_id & 3); // lane_id % 4
+    return cutlass::MatrixCoord(
+        quad + tile_offset.row() * Shape::kRow,
+        lane_in_quad * kElementsPerAccess +
+            tile_offset.column() * Shape::kColumn);
+  }
+  /// Per visited row m: beginRow(m), then op(m, n, idx) per element, endRow(m).
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    // See cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int row = 0; row < kAccumulatorRows; ++row) {
+        int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+            row * kRowsPerTile + lane_offset.row();
+        beginRow(accum_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+              (mma_n * Policy::MmaIterations::kRow + mma_m);
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn +
+                col + lane_offset.column();
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+            op(accum_m, accum_n, idx); // idx: position in the accumulator fragment
+          }
+        }
+
+        endRow(accum_m);
+      }
+    }
+  }
+  /// Folds myValue with fn across the 4 lanes of a quad; true if lane_in_quad == 0.
+  template <typename DT, typename F>
+  CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) {
+    // In each warp, 4 threads will work on the same row
+    // - the ones with the same `quad`
+    auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1);
+    myValue = fn(myValue, otherV);
+    otherV = __shfl_xor_sync(0xffffffff, myValue, 2);
+    myValue = fn(myValue, otherV);
+    int lane_in_quad = (lane_id & 3);
+    return lane_in_quad == 0;
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AccumLambdaIteratorSm70 { // per-lane (row, col) mapping of accumulators
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+
+  using Policy = typename T::Policy;
+  using InstructionShape = typename T::InstructionShape;
+  using OpDelta = typename T::OpDelta;
+  using Shape = typename T::Shape;
+  using Element = accum_t;
+
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename cutlass::platform::conditional<
+      cutlass::platform::is_same<Element, float>::value,
+      cutlass::MatrixShape<2, 2>,
+      cutlass::MatrixShape<1, 4>>::type; // 2x2 for float, 1x4 otherwise
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
+  /// Base (row, col) for a lane from lane_id and tile_offset; warp_id is unused.
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    int quad = (lane_id >> 2); // lane_id / 4
+    int lane_in_quad = (lane_id & 3); // lane_id % 4
+    int accum_m, accum_n;
+
+    if (cutlass::platform::is_same<Element, float>::value) {
+      // (quad[2],quad[0])+lane_in_quad[0]
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
+      // (quad[1])+lane_in_quad[1]
+      accum_n =
+          ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
+          (lane_in_quad & 2);
+    } else {
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
+          lane_in_quad; // (quad[2],quad[0])
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
+    }
+    return cutlass::MatrixCoord(
+        accum_m + tile_offset.row() * Shape::kRow,
+        accum_n + tile_offset.column() * Shape::kColumn);
+  }
+  /// Folds myValue with fn over lane bits 1 and 3; true if both bits are clear.
+  template <typename DT, typename F>
+  CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) {
+    static_assert(
+        cutlass::platform::is_same<Element, float>::value,
+        "update to support non-float accum");
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16
+    // T0 & T2 share same line within a quad
+    auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 1);
+    myValue = fn(myValue, otherV);
+    // quad 0 and quad 2 are on the same lines
+    otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 3);
+    myValue = fn(myValue, otherV);
+    return (lane_id & ((1 << 1) | (1 << 3))) == 0;
+  }
+  /// Per visited row m: beginRow(m), then op(m, n, idx) per element, endRow(m).
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+          int accum_m = tile_m * Policy::InterleavedTile::kRow +
+              mma_m * QuadShapePerPatialMma::kRow + m * 2 + lane_offset.row();
+          beginRow(accum_m);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn;
+               ++tile_n) {
+            CUTLASS_PRAGMA_UNROLL
+            for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn;
+                 ++mma_n) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int p = 0; p < kAccumulatorPatials; ++p) {
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  int mma_accum_start =
+                      (((tile_n * Policy::TileIterations::kRow + tile_m) *
+                            Policy::MmaIterations::kColumn +
+                        mma_n) *
+                           Policy::MmaIterations::kRow +
+                       mma_m) *
+                      kElementsPerMma;
+                  int accum_n = tile_n * Policy::InterleavedTile::kColumn +
+                      mma_n * QuadShapePerPatialMma::kColumn +
+                      p * Policy::InterleavedTile::kColumn / 2 + n +
+                      lane_offset.column();
+                  int idx = mma_accum_start + p * kElementsPerPartial +
+                      m * EleShapePerPatial::kColumn + n;
+                  op(accum_m, accum_n, idx); // idx: position in the fragment
+                }
+              }
+            }
+          }
+          endRow(accum_m);
+        }
+      }
+    }
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct AccumLambdaIteratorSimt { // row-by-row element visitor built on tile iterator T
+  using Policy = typename T::Policy;
+  using Iterations = typename T::Iterations;
+  using Element = typename T::Element;
+  using Delta = typename T::Delta;
+  using Shape = typename T::Shape;
+  static_assert(
+      cutlass::platform::
+          is_same<typename T::Layout, cutlass::layout::RowMajor>::value,
+      "only RowMajor is supported");
+  // Folds myValue across lanes with fn using XOR shuffles at strides 1, 2, 4, ...
+  template <typename DT, typename F>
+  CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int bit = 1; bit < Policy::WarpShape::kColumn; bit *= 2) {
+      auto otherV = __shfl_xor_sync(0xffffffff, myValue, bit); // all 32 lanes in the mask
+      myValue = fn(myValue, otherV);
+    }
+    return (lane_id & (Policy::WarpShape::kColumn - 1)) == 0; // true when the low lane_id bits are zero
+  }
+  // Per row: beginRow(row), then op(row, column, idx) per element, then endRow(row).
+  template <typename FA, typename FB, typename FC>
+  CUTLASS_DEVICE static void iterateRows(
+      cutlass::MatrixCoord& lane_offset,
+      FA beginRow,
+      FB op,
+      FC endRow) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
+        int accum_m = mma_m * Delta::kRow + m + lane_offset.row();
+        beginRow(accum_m);
+        // idx below linearizes (mma_m, m, mma_n, n) with n varying fastest.
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+          int accum_n =
+              mma_n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN +
+              lane_offset.column();
+          CUTLASS_PRAGMA_UNROLL
+          for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+            int idx = n +
+                Policy::LaneMmaShape::kN *
+                    (mma_n +
+                     Iterations::kColumn *
+                         (m + mma_m * Policy::LaneMmaShape::kM));
+            op(accum_m, accum_n + n, idx);
+          }
+        }
+        endRow(accum_m);
+      }
+    }
+  }
+  // Returns lane coord * LaneMmaShape(kM, kN) + tile_offset * Shape; warp_id is unused.
+  static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset(
+      int8_t lane_id,
+      int8_t warp_id,
+      typename T::TensorCoord const& tile_offset) {
+    static_assert(
+        cutlass::platform::is_same<
+            typename Policy::LaneLayout,
+            cutlass::layout::RowMajorInterleaved<1>>::value,
+        "");
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    cutlass::MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
+        cutlass::MatrixCoord(Policy::LaneMmaShape::kM,
+                             Policy::LaneMmaShape::kN);
+    return lane_offset +
+        tile_offset * cutlass::MatrixCoord(Shape::kRow, Shape::kColumn);
+  }
+};
+
+template <typename T, typename accum_t, int kWarpSize>
+struct DefaultMmaAccumLambdaIterator; // primary template: declared only, specialized per iterator type
+
+// Simt: maps MmaSimtTileIterator (operand kC, RowMajor) to AccumLambdaIteratorSimt
+template <typename S, typename P, typename accum_t, int kWarpSize>
+struct DefaultMmaAccumLambdaIterator<
+    cutlass::gemm::warp::MmaSimtTileIterator<
+        S,
+        cutlass::gemm::Operand::kC,
+        accum_t,
+        cutlass::layout::RowMajor,
+        P,
+        1,
+        1>,
+    accum_t,
+    kWarpSize> {
+  using WarpIterator = typename cutlass::gemm::warp::MmaSimtTileIterator<
+      S,
+      cutlass::gemm::Operand::kC,
+      accum_t,
+      cutlass::layout::RowMajor,
+      P,
+      1,
+      1>;
+  using Iterator = AccumLambdaIteratorSimt<WarpIterator, accum_t, kWarpSize>;
+};
+
+// TensorOp - Volta: maps MmaVoltaTensorOpAccumulatorTileIterator to AccumLambdaIteratorSm70
+template <typename S1, typename S2, typename accum_t, int kWarpSize>
+struct DefaultMmaAccumLambdaIterator<
+    cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+        S1,
+        accum_t,
+        cutlass::layout::RowMajor,
+        S2,
+        cutlass::MatrixShape<1, 1>>,
+    accum_t,
+    kWarpSize> {
+  using WarpIterator =
+      typename cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+          S1,
+          accum_t,
+          cutlass::layout::RowMajor,
+          S2,
+          cutlass::MatrixShape<1, 1>>;
+  using Iterator = AccumLambdaIteratorSm70<WarpIterator, accum_t, kWarpSize>;
+};
+
+// TensorOp - Sm75+: maps MmaTensorOpAccumulatorTileIterator to AccumLambdaIteratorSm80
+template <
+    typename S1,
+    typename S2,
+    typename S3,
+    typename accum_t,
+    int kWarpSize>
+struct DefaultMmaAccumLambdaIterator<
+    cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+        S1,
+        accum_t,
+        cutlass::layout::RowMajor,
+        S2,
+        S3>,
+    accum_t,
+    kWarpSize> {
+  using WarpIterator =
+      typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+          S1,
+          accum_t,
+          cutlass::layout::RowMajor,
+          S2,
+          S3>;
+  using Iterator = AccumLambdaIteratorSm80<WarpIterator, accum_t, kWarpSize>;
+};
diff --git a/third_party/fused_multi_head_attention/gemm/mma_from_smem.h b/third_party/fused_multi_head_attention/gemm/mma_from_smem.h
new file mode 100644
index 0000000000..f2b94d0031
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm/mma_from_smem.h
@@ -0,0 +1,1955 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tools and utils to store a GEMM output in shmem, and to use that
+   output as operandA for another GEMM back-to-back
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/functional.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+
+#include "../epilogue/epilogue_thread_apply_logsumexp.h"
+#include "../gemm/mma_accum_lambda_iterator.h"
+#include "../gemm_kernel_utils.h"
+#include "../iterators/default_warp_iterator_from_smem.h"
+#include "../iterators/make_residual_last.h"
+#include "../iterators/transpose_warp_iterator.h"
+#include "../iterators/warp_iterator_from_smem.h"
+#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/gemm/threadblock/mma_multistage.h"
+#include "cutlass/gemm/threadblock/mma_pipelined.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h"
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/// Storage object holding an accumulator tile of Shape (kM x kN) plus Padding.
+/// From 13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+template <
+    typename Shape_,
+    typename Element_,
+    typename Layout_,
+    typename Padding_>
+class AccumulatorSharedStorage {
+ public:
+  //
+  // Type definitions
+  //
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using Padding = Padding_;
+
+  /// Tensor reference to the accumulator
+  using TensorRefAccum = cutlass::TensorRef<Element, Layout>;
+
+  /// Shape of the accumulator matrix in shared memory (tile extent + padding)
+  using ShapeAccum = cutlass::
+      MatrixShape<Shape::kM + Padding::kRow, Shape::kN + Padding::kColumn>;
+
+ public:
+  //
+  // Data members
+  //
+
+  /// Buffer for accumulator (ShapeAccum::kCount elements)
+  cutlass::AlignedBuffer<Element, ShapeAccum::kCount> accum;
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Returns a packed layout object for the Accum matrix (padded extents)
+  CUTLASS_DEVICE
+  static Layout LayoutAccum() {
+    return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn});
+  }
+  // NOTE(review): LayoutAccum() is CUTLASS_DEVICE but accum_ref() below is CUTLASS_HOST_DEVICE.
+  /// Returns a TensorRef to the Accumulator
+  CUTLASS_HOST_DEVICE
+  TensorRefAccum accum_ref() {
+    return TensorRefAccum{accum.data(), LayoutAccum()};
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Common base of the threadblock-scoped MMAs in this file: defines the warp
+/// counts, the shared storage for operand B and the warp-level B iterator.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    // Maximum K dimension - also the dimension of the shared-memory
+    // holding `OperandA`
+    int kMaxK_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Layout in shared-memory of operand A
+    typename SmemLayoutA,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaBaseFromSharedMemory {
+ public:
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  static constexpr int kMaxK = kMaxK_;
+
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+
+  //
+  // Dependent types
+  //
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Shape describing the overall GEMM computed from shared memory
+  /// by each warp.
+  using WarpGemm = typename Policy::Operator::Shape;
+
+  /// Shape describing the number of warps filling the CTA
+  using WarpCount = GemmShape<
+      Shape::kM / WarpGemm::kM,
+      Shape::kN / WarpGemm::kN,
+      Shape::kK / WarpGemm::kK>;
+  using WarpCount1 = WarpCount;
+
+  /// Number of warp-level GEMM operations
+  static int const kWarpGemmIterations =
+      (WarpGemm::kK / Operator::Policy::MmaShape::kK);
+  static int const kWarpGemmIterations1 = kWarpGemmIterations;
+
+  /// Number of stages
+  static int const kStages = Stages;
+
+  /// If this is true, we fill the entire shmem buffer at start
+  /// and don't need to iterate through it in a circular fashion
+  static bool const kSmemContainsEntireB = kMaxK <= Shape::kK * kStages;
+
+  /// Tensor reference to the A operand
+  using TensorRefA = TensorRef<typename Operator::ElementA, SmemLayoutA>;
+
+  /// Tensor reference to the B operand
+  using TensorRefB =
+      TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
+
+  //
+  // Nested structs
+  //
+
+  /// Shared storage object needed by threadblock-scoped GEMM
+  class SharedStorage {
+   public:
+    //
+    // Type definitions
+    //
+
+    /// Shape of the B matrix operand in shared memory
+    /// (kStages tiles stacked along K, plus Policy::SmemPaddingB)
+    using ShapeB = MatrixShape<
+        Shape::kK * kStages + Policy::SmemPaddingB::kRow,
+        Shape::kN + Policy::SmemPaddingB::kColumn>;
+
+   public:
+    //
+    // Data members
+    //
+
+    /// Buffer for B operand
+    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
+
+   public:
+    //
+    // Methods
+    //
+
+    /// Returns a layout object for the B matrix
+    CUTLASS_HOST_DEVICE
+    static typename Operator::LayoutB LayoutB() {
+      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
+    }
+
+    /// Returns a TensorRef to the B operand
+    CUTLASS_HOST_DEVICE
+    TensorRefB operand_B_ref() {
+      return TensorRefB{operand_B.data(), LayoutB()};
+    }
+  };
+
+ protected:
+  //
+  // Data members
+  //
+
+  // /// Iterator to load a warp-scoped tile of A operand from shared memory
+  // typename Operator::IteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of B operand from shared memory
+  typename Operator::IteratorB warp_tile_iterator_B_;
+
+ public:
+  /// Construct from tensor references; only b_tile and lane_idx are used here
+  CUTLASS_DEVICE
+  MmaBaseFromSharedMemory(
+      ///< Reference to the operand B tile that the warp iterator reads from
+      TensorRefB& b_tile,
+      ///< ID within the threadblock (not used by this constructor)
+      int thread_idx,
+      ///< ID of warp (not used by this constructor)
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : warp_tile_iterator_B_(b_tile, lane_idx) {}
+};
+
+namespace {
+
+// Has the necessary trait compliance with WarpIteratorFromSmem but doesn't do
+// anything, can be default initialized, and uses a fragment that takes up
+// (almost) no space. This warp iterator is selected at compile time when
+// elementwise on-the-fly scaling for operand A is disabled, in which case
+// operations related to loading scale factors for operand A get wiped out by
+// the compiler.
+template <typename TensorRef>
+class NoOpWarpIteratorScale {
+ public:
+  // In pipelined+multistage MMA implementations we keep an array of fragments.
+  // If we aren't using scaling we don't want to waste registers on fragments
+  // of scale elements, so ideally this would be sized 0.
+  // Since arrays of zero-sized objects are not allowed, using size as 1.
+  // The compiler will most likely wipe it out anyway.
+  using Fragment = cutlass::Array<char, 1>;
+
+  CUTLASS_HOST_DEVICE
+  NoOpWarpIteratorScale() {}
+
+  CUTLASS_HOST_DEVICE
+  NoOpWarpIteratorScale(TensorRef const&, int) {} // both arguments are ignored
+
+  CUTLASS_HOST_DEVICE
+  NoOpWarpIteratorScale& add_tile_offset(
+      typename TensorRef::TensorCoord const&) {
+    return *this; // offset is ignored
+  }
+
+  CUTLASS_HOST_DEVICE
+  NoOpWarpIteratorScale& operator++() {
+    return *this; // nothing to advance
+  }
+
+  CUTLASS_DEVICE
+  void load(Fragment&) const {} // leaves the fragment untouched
+};
+
+// If scaling is enabled, performs fragment elementwise multiplication between
+// fragment and its scaling factor.
+template <typename Fragment, typename FragmentScale, bool ScalingEnabled>
+class FragmentElementwiseScaler; // primary template: declared only
+
+// Specialization for scaling being enabled.
+template <typename Fragment, typename FragmentScale>
+class FragmentElementwiseScaler<Fragment, FragmentScale, true> {
+ public:
+  // Cast scale_frag to Fragment's element type, then multiply elementwise.
+  CUTLASS_DEVICE
+  static Fragment apply(Fragment frag, FragmentScale const& scale_frag) {
+    Fragment converted_scale_frag = cutlass::NumericArrayConverter<
+        typename Fragment::Element,
+        typename FragmentScale::Element,
+        FragmentScale::kElements>()(scale_frag);
+    return cutlass::multiplies<Fragment>()(frag, converted_scale_frag);
+  }
+};
+
+// Specialization for scaling being disabled. Doesn't do anything and should
+// just get wiped out by the compiler.
+template <typename Fragment, typename FragmentScale>
+class FragmentElementwiseScaler<Fragment, FragmentScale, false> {
+ public:
+  CUTLASS_DEVICE
+  static Fragment apply(Fragment frag, FragmentScale const&) {
+    return frag; // scale fragment is ignored
+  }
+};
+} // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Two-stage (double-buffered) threadblock-scoped MMA: operand A is read from
+/// shared memory via WarpIteratorA_, operand B is streamed from global memory.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    // BEGIN smem
+    /// Iterates over the intermediate accumulator tile in shared memory
+    typename WarpIteratorA_,
+    /// whether or not to perform elementwise multiplication of A
+    //  by another matrix (A_scale) that is also kept in shared memory prior
+    //  to matmul A @ B
+    bool ScaleOperandA_,
+    /// Max GEMM problem size in K dimension
+    int MaxK,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to B operand
+    typename TransformB_ = NumericArrayConverter<
+        typename SmemIteratorB_::Element,
+        typename IteratorB_::Element,
+        IteratorB_::Fragment::kElements>,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory<
+                                         Shape_,
+                                         MaxK,
+                                         Policy_,
+                                         2,
+                                         typename WarpIteratorA_::Layout> {
+ public:
+  ///< Base class
+  using Base = MmaBaseFromSharedMemory<
+      Shape_,
+      MaxK,
+      Policy_,
+      2,
+      typename WarpIteratorA_::Layout>;
+
+  using Shape =
+      Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  static constexpr bool ScaleOperandA = ScaleOperandA_;
+
+  using WarpIteratorA = WarpIteratorA_;
+  ///< loads fragments of A_scale from shared memory if operand A scaling is
+  ///< enabled. otherwise no-op.
+  using WarpIteratorAScale = typename cutlass::platform::conditional<
+      ScaleOperandA,
+      WarpIteratorA,
+      NoOpWarpIteratorScale<typename WarpIteratorA::TensorRef>>::type;
+
+  using IteratorB =
+      IteratorB_; ///< Iterates over tiles of B operand in global memory
+  using ElementC = ElementC_; ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_; ///< Layout of accumulator matrix
+  using Policy = Policy_; ///< Policy describing tuning details
+
+  using SmemIteratorB = SmemIteratorB_;
+
+  using TransformB = TransformB_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB = typename IteratorB::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy::Operator::ArchTag;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  // statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert(
+      (Base::kStages == 2),
+      "MmaPipelined requires kStages set to value 2");
+
+ private:
+  using WarpFragmentA = typename Operator::FragmentA;
+
+  /// fragment type of OperandA elementwise scaling matrix. (almost) empty
+  /// if operand A scaling is disabled.
+  using WarpFragmentAScale = typename WarpIteratorAScale::Fragment;
+
+  using WarpFragmentB = typename Operator::FragmentB;
+
+  /// applies scaling factor to operand A fragment if operand A scaling is
+  /// enabled. otherwise no-op.
+  using FragmentAScaler = FragmentElementwiseScaler<
+      WarpFragmentA,
+      WarpFragmentAScale,
+      ScaleOperandA>;
+
+ protected:
+  // /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  // SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+  /// Iterator to load a warp-scoped tile of A operand from intermediate
+  /// accumulator tile
+  WarpIteratorA warp_tile_iterator_A_;
+
+  /// Iterator to load a warp-scoped tile of A_scale from intermediate
+  /// accumulator tile (only used if ScaleOperandA_ is true)
+  WarpIteratorAScale warp_tile_iterator_A_scale_;
+
+ public:
+  /// constructor for MMA with operand A scaling enabled.
+  CUTLASS_DEVICE
+  MmaPipelinedFromSharedMemory(
+      typename Base::TensorRefA a, // Operand A in shared memory
+      typename Base::TensorRefA a_scale, // Operand A_scale in shared memory
+      typename Base::TensorRefB
+          b_staging, // staging memory for loading tiles of B
+      int thread_idx,
+      int warp_idx,
+      int lane_idx)
+      : Base(b_staging, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A_(a, lane_idx),
+        warp_tile_iterator_A_scale_(a_scale, lane_idx),
+        smem_iterator_B_(b_staging, thread_idx) { // note: declared first, so initialized first
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_A_scale_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  /// Construct without an A_scale operand (scale iterator is default-constructed)
+  CUTLASS_DEVICE
+  MmaPipelinedFromSharedMemory(
+      typename Base::TensorRefA a, ///< Operand A in shared memory
+      typename Base::TensorRefB b_staging, ///< staging memory for loading B
+      int thread_idx, ///< ID within the threadblock
+      int warp_idx, ///< ID of warp
+      int lane_idx) ///< ID of each thread within a warp
+      : Base(b_staging, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A_(a, lane_idx),
+        smem_iterator_B_(b_staging, thread_idx) {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  // For API compatibility with MmaMultistageFromSharedMemory
+  // but not supported as it worsens perf: older gpus < sm80 don't
+  // support async transfers and have to waste registers
+  CUTLASS_DEVICE
+  void set_prologue_done(bool value) {} // intentionally a no-op
+  CUTLASS_DEVICE
+  static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      IteratorB iterator_B1,
+      int thread_idx,
+      int problem_size_0_n) {} // intentionally a no-op
+
+  CUTLASS_DEVICE
+  static void drain_cp_asyncs() {} // intentionally a no-op
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      int gemm_k_iterations, ///< number of iterations of the mainloop
+      FragmentC& accum, ///< destination accumulator tile
+      // IteratorA iterator_A,                             ///< iterator over A
+      // operand in global memory
+      IteratorB iterator_B, ///< iterator over B operand in global memory
+      FragmentC const& src_accum, ///< source accumulator tile
+      // TransformA transform_A = TransformA(),            ///< transformation
+      // applied to A fragment
+      TransformB transform_B =
+          TransformB()) { ///< transformation applied to B fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    FragmentB tb_frag_B;
+
+    tb_frag_B.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_B.set_residual_tile(gemm_k_iterations == 1);
+    iterator_B.load(tb_frag_B);
+
+    ++iterator_B;
+
+    this->smem_iterator_B_.store(transform_B(tb_frag_B));
+
+    ++this->smem_iterator_B_;
+
+    __syncthreads();
+
+    // remember that WarpFragmentAScale and WarpIteratorAScale are empty/no-op
+    // if scaling is disabled.
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpFragmentA warp_frag_A[2];
+    WarpFragmentAScale warp_frag_A_scale[2];
+    WarpFragmentB warp_frag_B[2];
+    warp_frag_A[0].clear();
+    warp_frag_A_scale[0].clear();
+    warp_frag_B[0].clear();
+
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_frag_A[0]);
+    this->warp_tile_iterator_A_scale_.load(warp_frag_A_scale[0]);
+    this->warp_tile_iterator_B_.load(warp_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_A_scale_;
+    ++this->warp_tile_iterator_B_;
+
+    Operator warp_mma;
+    // Toggled (1 <-> 0) once per mainloop iteration; selects which B iterator is rewound.
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_B.set_residual_tile(gemm_k_iterations == 2);
+    iterator_B.clear_mask(gemm_k_iterations <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER*
+    // issuing shared memory loads (which have the tightest latency
+    // requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > 0; --gemm_k_iterations) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        bool hasNext = true;
+
+        if (warp_mma_k == Base::kWarpGemmIterations - 1) {
+          if (gemm_k_iterations > 1) {
+            // Write fragments to shared memory
+            this->smem_iterator_B_.store(transform_B(tb_frag_B));
+          }
+
+          __syncthreads();
+
+          ++this->smem_iterator_B_;
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory SMEM: Don't reset iterator A, as
+          // we are continuing our iteration at this point
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+          } else {
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+          hasNext = gemm_k_iterations > 1; // no further warp tiles after the last k iteration
+        }
+
+        // Only read the next if we need to
+        if (hasNext) {
+          this->warp_tile_iterator_B_.set_kgroup_index(
+              (warp_mma_k + 1) % Base::kWarpGemmIterations);
+
+          this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_A_scale_.load(
+              warp_frag_A_scale[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
+
+          ++this->warp_tile_iterator_A_;
+          ++this->warp_tile_iterator_A_scale_;
+          ++this->warp_tile_iterator_B_;
+
+          if (warp_mma_k == 0) {
+            iterator_B.load(tb_frag_B);
+
+            ++iterator_B;
+
+            // Avoid reading out of bounds if this was the last loop iteration
+            iterator_B.set_residual_tile(gemm_k_iterations == 3);
+            iterator_B.clear_mask(gemm_k_iterations <= 2);
+          }
+        }
+
+        warp_mma(
+            accum,
+            FragmentAScaler::apply(
+                warp_frag_A[warp_mma_k % 2], warp_frag_A_scale[warp_mma_k % 2]),
+            warp_frag_B[warp_mma_k % 2],
+            accum);
+      }
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Taken from
+// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
+////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile in shared memory
+    typename WarpIteratorA1_,
+    /// whether or not to perform elementwise multiplication of A
+    //  by another matrix (A_scale) that is also kept in shared memory prior
+    //  to matmul A @ B
+    bool ScaleOperandA_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages_,
+    int kMaxK_,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaMultistageFromSharedMemory : public MmaBaseFromSharedMemory<
+                                          Shape1_,
+                                          kMaxK_,
+                                          Policy1_,
+                                          Stages_,
+                                          typename WarpIteratorA1_::Layout> {
+ public:
+  ///< Base class
+  using Base = MmaBaseFromSharedMemory<
+      Shape1_,
+      kMaxK_,
+      Policy1_,
+      Stages_,
+      typename WarpIteratorA1_::Layout>;
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  using IteratorB = IteratorB1;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+  using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate
+                                          ///< accumulator tile in shared memory
+  static constexpr bool ScaleOperandA = ScaleOperandA_;
+
+  ///< warp level iterator over A_scale matrix tile kept in shared memory.
+  ///< if elementwise A scaling is disabled then everything this does is no-op.
+  using WarpIteratorAScale = typename cutlass::platform::conditional<
+      ScaleOperandA,
+      WarpIteratorA1,
+      NoOpWarpIteratorScale<typename WarpIteratorA1::TensorRef>>::type;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+  static constexpr bool kSmemContainsEntireB = Base::kSmemContainsEntireB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+  using FragmentC = FragmentC1;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+    static_assert(
+        Base::kWarpGemmIterations1 > 1,
+        "The pipelined structure requires at least two warp-level "
+        "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLoadIterationsB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one group of operand B
+    static int const kAccessesPerGroupB1 =
+        (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) /
+        Base::kWarpGemmIterations1; // ceil(TBLoadIterationsB1 / kWarpGemmIterations1)
+  };
+
+  static constexpr int kNumStagesConcurrentLoad =
+      kSmemContainsEntireB ? Base::kStages : Base::kStages - 1;
+
+ private:
+  using WarpLoadedFragmentA1 = typename Operator1::FragmentA;
+  /// fragment of OperandA scale matrix. if operand A scaling is disabled this
+  /// is (almost) empty.
+  using WarpLoadedFragmentA1Scale = typename WarpIteratorAScale::Fragment;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+  /// applies elementwise scaling to fragment of A. if operand A scaling is
+  /// disabled this is a no-op.
+  using FragmentAScaler = FragmentElementwiseScaler<
+      WarpLoadedFragmentA1,
+      WarpLoadedFragmentA1Scale,
+      ScaleOperandA>;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Iterator to load a warp-scoped tile of A1 operand from intermediate
+  /// accumulator tile
+  WarpIteratorA1 warp_tile_iterator_A1_;
+
+  /// Iterator to load a warp-scoped tile of A1_scale operand from shared memory
+  /// if operand A scaling is disabled everything this does is a no-op.
+  WarpIteratorAScale warp_tile_iterator_A1_scale_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+  bool prologue_done_;
+
+ public:
+  /// Builds the MMA for the case where operand A is multiplied elementwise by
+  /// a second tile (`a_scale`) before the matmul.
+  CUTLASS_DEVICE
+  MmaMultistageFromSharedMemory(
+      typename Base::TensorRefA a,
+      typename Base::TensorRefA a_scale,
+      typename Base::TensorRefB b_tile,
+      int thread_idx,
+      int warp_idx,
+      int lane_idx)
+      : Base(b_tile, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A1_(a, lane_idx),
+        warp_tile_iterator_A1_scale_(a_scale, lane_idx),
+        smem_iterator_B1_(b_tile, thread_idx),
+        prologue_done_(false) {
+    // Split the linear warp index into (m, n, k) coordinates inside the
+    // threadblock tile: m varies fastest, then n, then k.
+    int const warps_per_k_slice = Base::WarpCount1::kM * Base::WarpCount1::kN;
+    int const slice_k = warp_idx / warps_per_k_slice;
+    int const slice_mn = warp_idx % warps_per_k_slice;
+    int const pos_m = slice_mn % Base::WarpCount1::kM;
+    int const pos_n = slice_mn / Base::WarpCount1::kM;
+
+    // Offset along K, expressed in warp-level tiles
+    int const k_tile_offset = Base::kWarpGemmIterations1 * slice_k;
+
+    // Move every warp-scoped iterator to this warp's first tile
+    warp_tile_iterator_A1_.add_tile_offset({pos_m, k_tile_offset});
+    warp_tile_iterator_A1_scale_.add_tile_offset({pos_m, k_tile_offset});
+    this->warp_tile_iterator_B_.add_tile_offset({k_tile_offset, pos_n});
+  }
+
+  /// Builds the MMA from tensor references without an A-scale operand; the
+  /// A-scale warp iterator is left default-constructed.
+  CUTLASS_DEVICE
+  MmaMultistageFromSharedMemory(
+      typename Base::TensorRefA a,
+      typename Base::TensorRefB b_tile,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx)
+      : Base(b_tile, thread_idx, warp_idx, lane_idx),
+        warp_tile_iterator_A1_(a, lane_idx),
+        smem_iterator_B1_(b_tile, thread_idx),
+        prologue_done_(false) {
+    // Split the linear warp index into (m, n, k) coordinates inside the
+    // threadblock tile: m varies fastest, then n, then k.
+    int const warps_per_k_slice = Base::WarpCount1::kM * Base::WarpCount1::kN;
+    int const slice_k = warp_idx / warps_per_k_slice;
+    int const slice_mn = warp_idx % warps_per_k_slice;
+    int const pos_m = slice_mn % Base::WarpCount1::kM;
+    int const pos_n = slice_mn / Base::WarpCount1::kM;
+
+    // Offset along K, expressed in warp-level tiles
+    int const k_tile_offset = Base::kWarpGemmIterations1 * slice_k;
+
+    // Move the warp-scoped iterators to this warp's first tile
+    warp_tile_iterator_A1_.add_tile_offset({pos_m, k_tile_offset});
+    this->warp_tile_iterator_B_.add_tile_offset({k_tile_offset, pos_n});
+  }
+
+  /// Records whether the prologue loads were already issued; operator() skips
+  /// its own call to _prologue() when this flag is true.
+  CUTLASS_DEVICE
+  void set_prologue_done(bool value) {
+    this->prologue_done_ = value;
+  }
+
+  /// Issues the initial global->shared loads of operand B through a
+  /// shared-memory iterator built from `shared_storage`. The number of K
+  /// iterations handed to _prologue() is ceil(problem_size_0_n / Shape::kK).
+  CUTLASS_DEVICE
+  static void prologue(
+      typename Base::SharedStorage& shared_storage,
+      IteratorB iterator_B1,
+      int thread_idx,
+      int problem_size_0_n) {
+    int const tile_k = Base::Shape::kK;
+    int const num_k_tiles = (problem_size_0_n + tile_k - 1) / tile_k;
+
+    SmemIteratorB1 smem_writer(shared_storage.operand_B_ref(), thread_idx);
+    _prologue(iterator_B1, num_k_tiles, smem_writer);
+  }
+
+  CUTLASS_DEVICE
+  static void drain_cp_asyncs() {
+    // Commit and drain all pending and predicated cp.async pnz from the GEMM
+    // mainloop, then synchronize the whole threadblock.
+    cutlass::arch::cp_async_fence(); // close the current group of cp.async
+    cutlass::arch::cp_async_wait<0>(); // wait until no group is still pending
+    __syncthreads(); // threadblock-wide barrier
+  }
+
+  /// Issues one group of cp.async copies of operand B from global to shared
+  /// memory, starting at access index `group_start_B1` within the current
+  /// stage. Both `iterator_B1` and the shared-memory iterator are advanced.
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(
+      IteratorB1& iterator_B1,
+      int group_start_B1 = 0) {
+    using AccessTypeB1 = typename IteratorB1::AccessType;
+
+    // Bytes moved by a single cp.async instruction
+    int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+        IteratorB1::ThreadMap::kElementsPerAccess /
+        IteratorB1::kAccessesPerVector / 8;
+
+    iterator_B1.set_iteration_index(
+        group_start_B1 * IteratorB1::kAccessesPerVector);
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access = 0; access < Detail::kAccessesPerGroupB1; ++access) {
+      // The last group may be partial: stay within the accesses of one stage
+      if (group_start_B1 + access < Detail::TBLoadIterationsB1) {
+        AccessTypeB1* smem_dst =
+            reinterpret_cast<AccessTypeB1*>(this->smem_iterator_B1_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int part = 0; part < IteratorB1::kAccessesPerVector; ++part) {
+          auto gmem_src = iterator_B1.get();
+
+          // The copy is predicated on valid(); cp_async_zfill is the
+          // zero-filling variant of cp.async
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              smem_dst + part, gmem_src, iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  /// Fills the first kNumStagesConcurrentLoad pipeline stages. For each stage
+  /// every thread issues its cp.async copies of operand B from global to
+  /// shared memory, both iterators are moved forward by one tile ({1, 0}) and
+  /// the stage is closed with a fence. `gemm_k_iterations_1` is decremented
+  /// once per stage and drives the residual-tile / mask state of `iterator_B1`.
+  CUTLASS_DEVICE
+  static void _prologue(
+      IteratorB& iterator_B1,
+      int32_t gemm_k_iterations_1,
+      SmemIteratorB1& smem_iterator_B1_) {
+    using AccessTypeB1 = typename IteratorB1::AccessType;
+
+    // Bytes moved by a single cp.async instruction
+    int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+        IteratorB1::ThreadMap::kElementsPerAccess /
+        IteratorB1::kAccessesPerVector / 8;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < kNumStagesConcurrentLoad;
+         ++stage, --gemm_k_iterations_1) {
+      iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1);
+      iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+      // Rewind both iterators to the first access of the stage
+      iterator_B1.set_iteration_index(0);
+      smem_iterator_B1_.set_iteration_index(0);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int access = 0; access < Detail::TBLoadIterationsB1; ++access) {
+        AccessTypeB1* smem_dst =
+            reinterpret_cast<AccessTypeB1*>(smem_iterator_B1_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int part = 0; part < IteratorB1::kAccessesPerVector; ++part) {
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              smem_dst + part, iterator_B1.get(), iterator_B1.valid());
+          ++iterator_B1;
+        }
+        ++smem_iterator_B1_;
+      }
+
+      // Step both iterators to the next stage
+      iterator_B1.add_tile_offset({1, 0});
+      smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Close this stage's group of cp.async instructions
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Refresh the residual-tile / mask state for the remaining iterations
+    iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1);
+    iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< number of threadblock-level K iterations of this GEMM
+      int gemm_k_iterations_1_,
+      ///< destination accumulator tile
+      FragmentC1& accum,
+      ///< iterator over B1 operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC1 const& src_accum) {
+    // 2nd Gemm: operand A1 (optionally scaled by A1_scale) is read through the
+    // warp iterators, operand B1 is streamed from global memory with cp.async.
+    //
+    // Prologue
+    //
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    if (!prologue_done_) {
+      _prologue(iterator_B1, gemm_k_iterations_1_, smem_iterator_B1_);
+    } else if (!kSmemContainsEntireB) {
+      // Restore the iterators increments: the loads were already issued, so
+      // only replay the iterator advances of _prologue() without any cp.async.
+      int gemm_k_iterations_1 = gemm_k_iterations_1_;
+      // Issue several complete stages
+      CUTLASS_PRAGMA_UNROLL
+      for (int stage = 0; stage < kNumStagesConcurrentLoad;
+           ++stage, --gemm_k_iterations_1) {
+        iterator_B1.set_iteration_index(0);
+        this->smem_iterator_B1_.set_iteration_index(0);
+
+        // Load for operand B
+        CUTLASS_PRAGMA_UNROLL
+        for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+            ++iterator_B1;
+          }
+          ++this->smem_iterator_B1_;
+        }
+        iterator_B1.add_tile_offset({1, 0});
+        this->smem_iterator_B1_.add_tile_offset({1, 0});
+      }
+      iterator_B1.set_residual_tile(gemm_k_iterations_1 <= 1);
+      iterator_B1.clear_mask(gemm_k_iterations_1 <= 0);
+    }
+
+    // DEPBAR+SYNC: wait for the first stage, then synchronize the threadblock
+    cutlass::arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+    __syncthreads();
+
+    // remember that WarpFragmentAScale and WarpIteratorAScale are no-op/empty
+    // if scaling is disabled.
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentA1Scale warp_loaded_frag_A1_scale[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
+    ++warp_tile_iterator_A1_;
+
+    warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[0]);
+    ++warp_tile_iterator_A1_scale_;
+
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[0]);
+    ++this->warp_tile_iterator_B_;
+    // Positions of the write / read stages in the circular shared-memory buffer
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma1.transform(
+        warp_transformed_frag_A1[0],
+        warp_transformed_frag_B1[0],
+        FragmentAScaler::apply(
+            warp_loaded_frag_A1[0], warp_loaded_frag_A1_scale[0]),
+        warp_loaded_frag_B1[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC1> plus_accum;
+
+    FragmentC1 tmp_accum;
+
+    if (platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int gemm_k_iterations_1 = gemm_k_iterations_1_ - (Base::kStages - 1);
+         gemm_k_iterations_1 > (-Base::kStages + 1);
+         gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+        // Load warp-level tile from accumulator fragment (A)
+        // or shared memory (operand B)
+        this->warp_tile_iterator_B_.set_kgroup_index(
+            (warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        // skip warp tile loading for the last kgroup (we are out of the buf)
+        if (gemm_k_iterations_1 > (-Base::kStages + 2) ||
+            warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          warp_tile_iterator_A1_.load(
+              warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
+          warp_tile_iterator_A1_scale_.load(
+              warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]);
+          this->warp_tile_iterator_B_.load(
+              warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+        }
+        ++warp_tile_iterator_A1_;
+        ++warp_tile_iterator_A1_scale_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              FragmentAScaler::apply(
+                  warp_loaded_frag_A1[warp_mma_k % 2],
+                  warp_loaded_frag_A1_scale[warp_mma_k % 2]),
+              warp_loaded_frag_B1[warp_mma_k % 2]);
+
+        if (platform::is_same<
+                typename Operator1::MathOperator,
+                arch::OpMultiplyAddFastF32>::value ||
+            platform::is_same<
+                typename Operator1::MathOperator,
+                arch::OpMultiplyAddComplexFastF32>::value) {
+          warp_mma1(
+              tmp_accum,
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              tmp_accum);
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma1(
+              accum,
+              warp_transformed_frag_A1[warp_mma_k % 2],
+              warp_transformed_frag_B1[warp_mma_k % 2],
+              accum);
+        }
+
+        // Issue global->shared copies for this stage
+        if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          int group_start_iteration_B1;
+
+          group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
+
+          if (!kSmemContainsEntireB) {
+            copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+          }
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          int group_start_iteration_B1;
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+
+          if (!kSmemContainsEntireB) {
+            copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+          }
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until all but kNumStagesConcurrentLoad - 1 groups completed.
+          arch::cp_async_wait<kNumStagesConcurrentLoad - 1>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.add_tile_offset({1, 0});
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (!kSmemContainsEntireB) {
+            if (smem_write_stage_idx == (Base::kStages - 1)) {
+              this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+              smem_write_stage_idx = 0;
+            } else {
+              ++smem_write_stage_idx;
+            }
+
+            if (smem_read_stage_idx == (Base::kStages - 1)) {
+              this->warp_tile_iterator_B_.add_tile_offset(
+                  {-Base::kStages * Policy1::kPartitionsK *
+                       Base::kWarpGemmIterations1,
+                   0});
+              smem_read_stage_idx = 0;
+            } else {
+              ++smem_read_stage_idx;
+            }
+          }
+
+          iterator_B1.set_residual_tile(gemm_k_iterations_1 == 2);
+          iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(
+              warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+              warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+              FragmentAScaler::apply(
+                  warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+                  warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]),
+              warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+      }
+    }
+    // FastF32 paths only: fold the staged accumulator into the final result
+    if (platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddFastF32>::value ||
+        platform::is_same<
+            typename Operator1::MathOperator,
+            arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum);
+    }
+  }
+};
+
+// Converts a "regular" Mma into its counterpart reading operand A from shared memory (see ::Mma)
+template <
+    typename Mma_, // regular threadblock Mma; the specializations match on it
+    int kMaxK, // max MMA problem size K
+    typename WarpIteratorA_, // warp-level iterator used to read operand A
+    /// whether or not to apply elementwise multiplication of operand A by
+    /// another matrix in shared memory before usage in A @ B
+    bool kScaleOperandA,
+    bool kTransposeA = false> // request a transposed A warp iterator
+struct DefaultMmaFromSharedMemory;
+
+// Mma pipelined: maps MmaPipelined onto MmaPipelinedFromSharedMemory
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    typename WarpIteratorA_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Transformation applied to A operand
+    typename TransformA_,
+    /// Transformation applied to B operand
+    typename TransformB_,
+    // Max MMA problem size K
+    int kMaxK,
+    /// whether or not to apply elementwise multiplication of operand A by
+    /// another matrix in shared memory before usage in A @ B
+    bool kScaleOperandA,
+    bool kTransposeA>
+struct DefaultMmaFromSharedMemory<
+    MmaPipelined<
+        Shape_,
+        IteratorA_,
+        SmemIteratorA_,
+        IteratorB_,
+        SmemIteratorB_,
+        ElementC_,
+        LayoutC_,
+        Policy_,
+        TransformA_,
+        TransformB_>,
+    kMaxK,
+    WarpIteratorA_,
+    kScaleOperandA,
+    kTransposeA> {
+  using RegularMma = MmaPipelined<
+      Shape_,
+      IteratorA_,
+      SmemIteratorA_,
+      IteratorB_,
+      SmemIteratorB_,
+      ElementC_,
+      LayoutC_,
+      Policy_,
+      TransformA_,
+      TransformB_>;
+
+  using WarpShape = typename Policy_::Operator::Shape;
+  using InstructionShape = typename Policy_::Operator::InstructionShape;
+  using ArchMmaOperator = typename Policy_::Operator;
+
+  static constexpr bool kIsTransposedA = false; // kTransposeA is ignored by this specialization
+  using WarpIteratorA = WarpIteratorA_;
+  using IteratorB =
+      typename cutlass::transform::threadblock::MakeIteratorResidualLast<
+          IteratorB_>::Iterator;
+  // Drops IteratorA_/SmemIteratorA_/TransformA_/TransformB_: A is read through WarpIteratorA
+  using Mma = typename cutlass::gemm::threadblock::MmaPipelinedFromSharedMemory<
+      Shape_,
+      WarpIteratorA,
+      kScaleOperandA,
+      kMaxK,
+      IteratorB,
+      SmemIteratorB_,
+      ElementC_,
+      LayoutC_,
+      Policy_>;
+};
+
+template < // Mma multistage: maps MmaMultistage onto MmaMultistageFromSharedMemory
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    typename WarpIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Layout of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    int kMaxK,
+    /// whether or not to apply elementwise multiplication of operand A by
+    /// another matrix in shared memory before usage in A @ B
+    bool kScaleOperandA,
+    bool kTransposeA>
+struct DefaultMmaFromSharedMemory<
+    MmaMultistage<
+        Shape_,
+        IteratorA_,
+        SmemIteratorA_,
+        CacheOpA,
+        IteratorB_,
+        SmemIteratorB_,
+        CacheOpB,
+        ElementC_,
+        LayoutC_,
+        Policy_,
+        Stages,
+        SharedMemoryClear>,
+    kMaxK,
+    WarpIteratorA_,
+    kScaleOperandA,
+    kTransposeA> {
+  using RegularMma = MmaMultistage<
+      Shape_,
+      IteratorA_,
+      SmemIteratorA_,
+      CacheOpA,
+      IteratorB_,
+      SmemIteratorB_,
+      CacheOpB,
+      ElementC_,
+      LayoutC_,
+      Policy_,
+      Stages,
+      SharedMemoryClear>;
+
+  using WarpShape = typename Policy_::Operator::Shape;
+  using InstructionShape = typename Policy_::Operator::InstructionShape;
+  using WarpIteratorTranspose = TransposeWarpIterator<WarpIteratorA_>;
+  static constexpr bool kIsTransposedA =
+      WarpIteratorTranspose::kSupportsTranspose && kTransposeA; // only if supported
+  using WarpIteratorA = typename platform::conditional<
+      kIsTransposedA,
+      typename WarpIteratorTranspose::Iterator,
+      WarpIteratorA_>::type;
+
+  // Reduce the number of stages if we don't need that many: cap at ceil(kMaxK / Shape_::kK)
+  static int constexpr kStagesMax =
+      (kMaxK + int(Shape_::kK) - 1) / int(Shape_::kK);
+  static int constexpr kStages = cutlass::const_min(Stages, kStagesMax);
+
+  using IteratorB =
+      typename cutlass::transform::threadblock::MakeIteratorResidualLast<
+          IteratorB_>::Iterator;
+  using Mma =
+      typename cutlass::gemm::threadblock::MmaMultistageFromSharedMemory<
+          Shape_,
+          WarpIteratorA,
+          kScaleOperandA,
+          IteratorB,
+          SmemIteratorB_,
+          RegularMma::kCacheOpB, // cache operation for B taken from the regular Mma
+          ElementC_,
+          LayoutC_,
+          Policy_,
+          kStages, // possibly reduced stage count, see above
+          kMaxK>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template < // Primary template; specializations provide accumToSmem()
+    typename IteratorC, // accumulator tile iterator; the specializations match on it
+    typename Operator,
+    typename scalar_t, // element type the accumulators are stored as
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm;
+
+// Tensor Cores >= Sm75 specialization (Ampere ...)
+template < /// Size of the matrix to load (concept: MatrixShape)
+    typename Shape_,
+    /// Element type
+    typename Element_,
+    /// Layout of operand in memory
+    typename Layout_,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_,
+    typename Operator,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+        Shape_,
+        Element_,
+        Layout_,
+        InstructionShape_,
+        OpDelta_>,
+    Operator,
+    scalar_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC =
+      typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator<
+          Shape_,
+          Element_,
+          Layout_,
+          InstructionShape_,
+          OpDelta_>;
+  using FragmentC = typename IteratorC::Fragment;
+  using InstructionShape = InstructionShape_;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using accum_t = Element_;
+  using lse_scalar_t = float; // element type read through the `lse` pointer
+
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+
+  // Iterator to load accumulators (results of matmul in registers)
+  using FragmentIteratorAccumulator =
+      cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape,
+          InstructionShape,
+          accum_t,
+          typename Operator::Policy::Operator::FragmentC,
+          cutlass::layout::RowMajor>;
+
+  // Iterator to store to shared-memory
+  using SmemIteratorD0 = typename cutlass::epilogue::warp::TileIteratorTensorOp<
+      WarpShape,
+      InstructionShape,
+      scalar_t, // accum_t,
+      SmemAccumulatorLayout>;
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          typename SmemIteratorD0::Element,
+          typename SmemIteratorD0::TensorLayout,
+          typename SmemIteratorD0::Padding>;
+  // We need to provide an operation for the epilogue. Let's create an
+  // operation that does nothing (ScaleType::Nothing), just converts
+  // from accum_t (float) -> scalar_t (can be half)
+  using OutputOpNoOp = cutlass::epilogue::thread::LinearCombination<
+      typename SmemIteratorD0::Element, // ElementOutput
+      FragmentIteratorAccumulator::Fragment::kElements,
+      accum_t, // ElementAccumulator
+      typename SmemIteratorD0::Element, // ElementCompute
+      cutlass::epilogue::thread::ScaleType::Nothing>;
+  using Epilogue = cutlass::epilogue::threadblock::EpilogueSmemAccumulator<
+      SmemIteratorD0,
+      FragmentIteratorAccumulator,
+      SmemIteratorD0, // ScaleBiasIterator - not used
+      OutputOpNoOp>;
+
+  // Epilogue 2: with LSE (for backwards pass)
+  static int const kElementsPerAccess = 2; // TODO: Why 2?
+  using IteratorAccumulatorLSE =
+      cutlass::transform::threadblock::VectorIterator<
+          cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+              // Shape
+              cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kN>,
+              // WarpShape
+              cutlass::MatrixShape<WarpShape::kM, WarpShape::kN>,
+              lse_scalar_t,
+              cutlass::layout::RowMajor,
+              kElementsPerAccess>>;
+  using EpilogueOpApplyLSE = cutlass::epilogue::thread::ApplyLogSumExp<
+      scalar_t, // ElementOutput_
+      lse_scalar_t, // ElementLSE_
+      accum_t, // ElementAccumulator_
+      accum_t, // ElementCompute_
+      128 / cutlass::sizeof_bits<scalar_t>::value
+      // FragmentIteratorAccumulator::Fragment::kElements
+      // InstructionShape::kM * InstructionShape::kN / 32
+      >;
+  using EpilogueWithLSE =
+      cutlass::epilogue::threadblock::EpilogueSmemAccumulator<
+          SmemIteratorD0,
+          FragmentIteratorAccumulator,
+          IteratorAccumulatorLSE,
+          EpilogueOpApplyLSE>;
+  // Stores `accum` to shared memory at tile `tile_coords` through the no-op epilogue
+  static void CUTLASS_DEVICE accumToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id);
+    smem_iterator_attn.add_tile_offset(
+        tile_coords *
+        cutlass::MatrixCoord{
+            SmemIteratorD0::TileIterations::kRow,
+            SmemIteratorD0::TileIterations::kColumn});
+    Epilogue epilogue;
+    epilogue(OutputOpNoOp({}), smem_iterator_attn, accum);
+  }
+  // Same as accumToSmem, but runs the ApplyLogSumExp epilogue with values read from `lse`
+  static void CUTLASS_DEVICE accumApplyLSEToSmem(
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC& accum,
+      lse_scalar_t const* lse,
+      int32_t lse_extents,
+      int thread_id,
+      int warp_id,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    constexpr int32_t kAlignLSE = 32; // the LSE extent is rounded up to a multiple of this
+    IteratorAccumulatorLSE iterator_lse(
+        lse,
+        {(int32_t)0, (int32_t)ceil_div(lse_extents, kAlignLSE) * kAlignLSE},
+        thread_id,
+        warp_id,
+        cutlass::MatrixCoord{0, 0} // offset
+    );
+
+    SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id);
+    smem_iterator_attn.add_tile_offset(
+        tile_coords *
+        cutlass::MatrixCoord{
+            SmemIteratorD0::TileIterations::kRow,
+            SmemIteratorD0::TileIterations::kColumn});
+    EpilogueWithLSE epilogue;
+    EpilogueOpApplyLSE minus_lse_exp({}); // presumably computes exp(accum - lse) - TODO confirm
+    epilogue(
+        minus_lse_exp,
+        smem_iterator_attn,
+        accum,
+        // scale - unused
+        iterator_lse,
+        // bias
+        iterator_lse);
+  }
+};
+
+// Volta specialization (f16 operands with f32 accumulators only). The store
+// logic of MmaVoltaTensorOpAccumulatorTileIterator is redone in accumToSmem.
+template <typename Operator, typename WarpShape_, typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+        cutlass::MatrixShape<32, 32>,
+        float,
+        cutlass::layout::RowMajor,
+        cutlass::gemm::GemmShape<16, 16, 4>,
+        cutlass::MatrixShape<1, 1>>,
+    Operator,
+    cutlass::half_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC =
+      cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
+          cutlass::MatrixShape<32, 32>,
+          float,
+          cutlass::layout::RowMajor,
+          cutlass::gemm::GemmShape<16, 16, 4>,
+          cutlass::MatrixShape<1, 1>>;
+  using scalar_t = cutlass::half_t;
+  using accum_t = IteratorC::Element;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using FragmentC = IteratorC::Fragment;
+  using lse_scalar_t = float;
+
+  // Shared-memory storage for Q.Kt, held as scalar_t in the Volta layout
+  using SmemAccumulatorLayout =
+      cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>;
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          scalar_t,
+          SmemAccumulatorLayout,
+          cutlass::MatrixShape<0, 0> // Padding
+          >;
+  using TensorRef = cutlass::TensorRef<scalar_t, SmemAccumulatorLayout>;
+  using Policy = typename IteratorC::Policy;
+  using Element = accum_t;
+  // The constants below are private in MmaVoltaTensorOpAccumulatorTileIterator,
+  // so their values are duplicated here.
+  static int const kElementsPerPartial = 4;
+  using EleShapePerPatial = typename cutlass::platform::conditional<
+      cutlass::platform::is_same<Element, float>::value,
+      cutlass::MatrixShape<2, 2>,
+      cutlass::MatrixShape<1, 4>>::type;
+  static int const kElementsPerMma = 8;
+  static int const kAccumulatorPatials = 2;
+  using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
+
+  static void CUTLASS_DEVICE accumToSmem( // stores accum as scalar_t in smem
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    // ctor - from MmaVoltaTensorOpAccumulatorTileIterator
+    TensorRef ref_(shared_storage.accum_ref());
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+    int accum_m, accum_n; // this lane's (row, column) offset inside the tile
+
+    if (cutlass::platform::is_same<Element, float>::value) {
+      // (quad[2],quad[0])+lane_in_quad[0]
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
+      // (quad[1])+lane_in_quad[1]
+      accum_n =
+          ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
+          (lane_in_quad & 2);
+    } else {
+      accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
+          lane_in_quad; // (quad[2],quad[0])
+      accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
+    }
+    cutlass::MatrixCoord lane_offset(accum_m, accum_n);
+
+    // Tile offset
+    ref_.add_coord_offset(
+        tile_coords *
+        cutlass::MatrixCoord(
+            {IteratorC::Shape::kRow, IteratorC::Shape::kColumn}));
+
+    using AccessType = cutlass::Array<scalar_t, EleShapePerPatial::kColumn>;
+
+    // store - from MmaVoltaTensorOpAccumulatorTileIterator
+    CUTLASS_PRAGMA_UNROLL
+    for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+            int mma_accum_start =
+                (((tile_n * Policy::TileIterations::kRow + tile_m) *
+                      Policy::MmaIterations::kColumn +
+                  mma_n) *
+                     Policy::MmaIterations::kRow +
+                 mma_m) *
+                kElementsPerMma;
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int p = 0; p < kAccumulatorPatials; ++p) {
+              CUTLASS_PRAGMA_UNROLL
+              for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
+                int accum_m = tile_m * Policy::InterleavedTile::kRow +
+                    mma_m * QuadShapePerPatialMma::kRow + m * 2;
+                int accum_n = tile_n * Policy::InterleavedTile::kColumn +
+                    mma_n * QuadShapePerPatialMma::kColumn +
+                    p * Policy::InterleavedTile::kColumn / 2;
+                int r = (accum_m + lane_offset.row());
+                AccessType to_store; // one row of EleShapePerPatial::kColumn
+                CUTLASS_PRAGMA_UNROLL
+                for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
+                  int idx = mma_accum_start + p * kElementsPerPartial +
+                      m * EleShapePerPatial::kColumn + n;
+                  // element n of the vector written at (r, c) below
+                  to_store[n] = scalar_t(accum[idx]);
+                }
+                int c = (accum_n + lane_offset.column());
+                assert(r < 32);
+                assert(c < 32);
+                *reinterpret_cast<AccessType*>(
+                    ref_.data() + ref_.offset({r, c})) = to_store;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  static void CUTLASS_DEVICE accumApplyLSEToSmem( // expf(accum - lse) -> smem
+      AccumulatorSharedStorage& shared_storage,
+      typename IteratorC::Fragment& accum, // overwritten in place
+      lse_scalar_t const* lse, // indexed by accumulator column (accum_n)
+      int lse_extent, // columns >= lse_extent use +infinity instead of lse
+      int thread_id, // not used by this specialization
+      int warp_id,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    // Non-optimized way to apply LSE to registers
+    // NOTE: accum is attn.T
+    // TODO: Optimize for each architecture
+    static constexpr int WarpSize = 32;
+    using AccumLambdaIterator =
+        typename DefaultMmaAccumLambdaIterator<IteratorC, accum_t, WarpSize>::
+            Iterator;
+    auto lane_offset =
+        AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords);
+
+    cutlass::Array<lse_scalar_t, IteratorC::Fragment::kElements> lse_prefetched;
+    lse_prefetched.clear();
+    int rowIdx = 0; // 1-based count of rows started so far
+    int colIdx = 0; // position within the current row
+    AccumLambdaIterator::iterateRows(
+        lane_offset,
+        [&](int accum_m) {
+          ++rowIdx;
+          colIdx = 0;
+        },
+        [&](int accum_m, int accum_n, int idx) {
+          if (rowIdx == 1) { // lse is loaded on the first row, reused after
+            lse_prefetched[colIdx] = accum_n < lse_extent
+                ? lse[accum_n]
+                : cutlass::platform::numeric_limits<accum_t>::infinity();
+          }
+          accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]);
+          ++colIdx;
+        },
+        [&](int accum_m) {});
+    accumToSmem(shared_storage, accum, lane_id, tile_coords);
+  }
+};
+
+// Simt specialization: accumulators come from an MmaSimtTileIterator.
+// Per the original note: for f32 on Sm70-Sm75 and f16/f32 below.
+
+template <
+    typename Operator,
+    typename OperatorPolicy,
+    typename scalar_t,
+    typename WarpShape_,
+    typename ThreadblockShape_>
+struct B2bGemm<
+    cutlass::gemm::warp::MmaSimtTileIterator<
+        cutlass::MatrixShape<32, 32>,
+        cutlass::gemm::Operand::kC,
+        float,
+        cutlass::layout::RowMajor,
+        OperatorPolicy,
+        1,
+        1>,
+    Operator,
+    scalar_t,
+    WarpShape_,
+    ThreadblockShape_> {
+  using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator<
+      cutlass::MatrixShape<32, 32>,
+      cutlass::gemm::Operand::kC,
+      float,
+      cutlass::layout::RowMajor,
+      OperatorPolicy,
+      1,
+      1>;
+  using accum_t = typename IteratorC::Element;
+  using WarpShape = WarpShape_;
+  using ThreadblockShape = ThreadblockShape_;
+  using FragmentC = typename IteratorC::Fragment;
+  using lse_scalar_t = float;
+
+  // Shared-memory storage for Q.Kt, held as scalar_t in column-major order
+  using AccumulatorSharedStorage =
+      cutlass::gemm::threadblock::AccumulatorSharedStorage<
+          ThreadblockShape,
+          scalar_t,
+          cutlass::layout::ColumnMajor,
+          cutlass::MatrixShape<0, 0> // Padding
+          >;
+
+  static void CUTLASS_DEVICE accumToSmem( // stores accum as scalar_t in smem
+      AccumulatorSharedStorage& shared_storage,
+      FragmentC const& accum,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    using Policy = typename IteratorC::Policy;
+    using Element = typename IteratorC::Element;
+    using Iterations = typename IteratorC::Iterations;
+    using Delta = typename IteratorC::Delta;
+
+    auto ref_ = shared_storage.accum_ref();
+    // ctor - MmaSimtTileIterator
+    // compute offset based on thread ID and lane layout
+    typename Policy::LaneLayout lane_layout = Policy::get_lane_layout();
+
+    MatrixCoord lane_offset = lane_layout.inverse(lane_id) *
+        MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN);
+
+    ref_.add_coord_offset(lane_offset);
+
+    // Tile offset
+    ref_.add_coord_offset(
+        tile_coords *
+        cutlass::MatrixCoord(
+            {IteratorC::Shape::kRow, IteratorC::Shape::kColumn}));
+
+    // store - MmaSimtTileIterator
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) {
+            int r = // row relative to ref_ (lane and tile offsets applied)
+                Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) +
+                m;
+            int c = mma_n * Delta::kColumn + n; // column relative to ref_
+            int idx = n + // flat position of this element inside accum
+                Policy::LaneMmaShape::kN *
+                    (mma_n +
+                     Iterations::kColumn *
+                         (m + mma_m * Policy::LaneMmaShape::kM));
+            ref_.at({r, c}) = scalar_t(accum[idx]); // converted on store
+          }
+        }
+      }
+    }
+  }
+
+  static void CUTLASS_DEVICE accumApplyLSEToSmem( // expf(accum - lse) -> smem
+      AccumulatorSharedStorage& shared_storage,
+      typename IteratorC::Fragment& accum, // overwritten in place
+      lse_scalar_t const* lse, // indexed by accumulator column (accum_n)
+      int lse_extent, // columns >= lse_extent use +infinity instead of lse
+      int thread_id, // not used by this specialization
+      int warp_id,
+      int lane_id,
+      cutlass::MatrixCoord const& tile_coords) {
+    // Non-optimized way to apply LSE to registers
+    // NOTE: accum is attn.T
+    // TODO: Optimize for each architecture
+    static constexpr int WarpSize = 32;
+    using AccumLambdaIterator =
+        typename DefaultMmaAccumLambdaIterator<IteratorC, accum_t, WarpSize>::
+            Iterator;
+    auto lane_offset =
+        AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords);
+
+    cutlass::Array<lse_scalar_t, IteratorC::Fragment::kElements> lse_prefetched;
+    lse_prefetched.clear();
+    int rowIdx = 0; // 1-based count of rows started so far
+    int colIdx = 0; // position within the current row
+    AccumLambdaIterator::iterateRows(
+        lane_offset,
+        [&](int accum_m) {
+          ++rowIdx;
+          colIdx = 0;
+        },
+        [&](int accum_m, int accum_n, int idx) {
+          if (rowIdx == 1) { // lse is loaded on the first row, reused after
+            lse_prefetched[colIdx] = accum_n < lse_extent
+                ? lse[accum_n]
+                : cutlass::platform::numeric_limits<accum_t>::infinity();
+          }
+          accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]);
+          ++colIdx;
+        },
+        [&](int accum_m) {});
+    accumToSmem(shared_storage, accum, lane_id, tile_coords);
+  }
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/gemm_kernel_utils.h b/third_party/fused_multi_head_attention/gemm_kernel_utils.h
new file mode 100644
index 0000000000..3703257a17
--- /dev/null
+++ b/third_party/fused_multi_head_attention/gemm_kernel_utils.h
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Some helper functions
+////////////////////////////////////////////////////////////////////////////////
+#define DISPATCH_TYPES(tensor, func)                                           \
+  { /* Calls func() with scalar_t aliased to the element type of `tensor`. */  \
+    if ((tensor).scalar_type() == at::ScalarType::Float) {                     \
+      using scalar_t = float;                                                  \
+      func();                                                                  \
+    } else if ((tensor).scalar_type() == at::ScalarType::Half) {               \
+      using scalar_t = cutlass::half_t;                                        \
+      func();                                                                  \
+    } else if ((tensor).scalar_type() == at::ScalarType::BFloat16) {           \
+      using scalar_t = cutlass::bfloat16_t;                                    \
+      func();                                                                  \
+    } else {                                                                   \
+      XFORMERS_CHECK(false, "Only fp32, half & bf16 supported at the moment"); \
+    }                                                                          \
+  }
+
+#define DISPATCH_BOOL(BOOL_V, BOOL_NAME, F) \
+  { /* Calls F() with BOOL_NAME = std::true_type or std::false_type. */ \
+    if (BOOL_V) {                           \
+      using BOOL_NAME = std::true_type;      \
+      F();                                  \
+    } else {                                \
+      using BOOL_NAME = std::false_type;      \
+      F();                                  \
+    }                                       \
+  }
+
+#define DISPATCH_ARCHTAG(CC, func)                                        \
+  { /* Calls func() with ArchTag set from compute capability CC. */       \
+    if ((CC) >= 80) {                                                     \
+      using ArchTag = cutlass::arch::Sm80;                                \
+      func();                                                             \
+    } else if ((CC) >= 75) {                                              \
+      using ArchTag = cutlass::arch::Sm75;                                \
+      func();                                                             \
+    } else if ((CC) >= 70) {                                              \
+      using ArchTag = cutlass::arch::Sm70;                                \
+      func();                                                             \
+    } else if ((CC) >= 50) {                                              \
+      using ArchTag = cutlass::arch::Sm50;                                \
+      func();                                                             \
+    } else {                                                              \
+      XFORMERS_CHECK(                                                     \
+          false,                                                          \
+          "Your device is too old. We require compute capability >= 50"); \
+    }                                                                     \
+  }
+
+#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) /* dense + contiguous */   \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(TENSOR.is_contiguous(), #TENSOR " must be contiguous");
+// As above, but only the last dimension needs to be contiguous (stride 1).
+#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR)                        \
+  XFORMERS_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor");     \
+  XFORMERS_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \
+  XFORMERS_CHECK(                                                         \
+      TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous");
+
+#ifdef TORCH_CHECK // Torch available: checks are forwarded to TORCH_CHECK
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \
+  XFORMERS_CHECK(                         \
+      uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned")
+#define XFORMERS_CHECK TORCH_CHECK
+#elif defined(__CUDACC_RTC__) // failed checks `return false` with no message
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT)  \
+  if (!(uint64_t(PTR) % ALIGNMENT == 0)) { \
+    return false;                          \
+  }
+#define XFORMERS_CHECK(COND, ERR) \
+  if (!(COND)) {                  \
+    return false;                 \
+  }
+#else // failed checks print to std::cerr, then `return false`
+#include <iostream>
+#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT)            \
+  if (!(uint64_t(PTR) % ALIGNMENT == 0)) {           \
+    std::cerr << #PTR " is not correctly aligned\n"; \
+    return false;                                    \
+  }
+#define XFORMERS_CHECK(COND, ERR)                       \
+  if (!(COND)) {                                        \
+    std::cerr << "'" #COND "' failed: " << ERR << "\n"; \
+    return false;                                       \
+  }
+#endif // TORCH_CHECK / __CUDACC_RTC__ / fallback
+
+#define ASSIGN_CHECK_OVERFLOW(A, B)                                    \
+  { /* A = B, then checks B < decltype(A) max; evaluates B twice */    \
+    A = B;                                                             \
+    XFORMERS_CHECK(                                                    \
+        B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
+  }
+
+namespace gemm_kernel_utils {
+
+template <typename integer> // n / m rounded up (intended for n >= 0, m > 0)
+constexpr CUTLASS_HOST_DEVICE integer ceil_div(integer n, integer m) {
+  return (n + m - 1) / m;
+}
+// Rounds n up to the nearest multiple of m.
+template <typename integer>
+constexpr CUTLASS_HOST_DEVICE integer align_up(integer n, integer m) {
+  return gemm_kernel_utils::ceil_div(n, m) * m;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Determine the type of GEMM we do (TensorCores or not, Shapes ...)
+// TODO: Maybe we could rely on Cutlass's DefaultGemm templates
+////////////////////////////////////////////////////////////////////////////////
+
+// Primary template: Simt (FMA on cuda cores) when no specialization applies
+template <typename ArchTag, typename scalar_t_, typename Enable = void>
+struct DefaultGemmType {
+  using OpClass = cutlass::arch::OpClassSimt;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+  static constexpr int kMinimumAlignment = 1;
+  static constexpr int WarpK = 8;
+  static constexpr int ThreadK = 8;
+};
+
+// Tensor-core settings for f32, chosen when kMinComputeCapability >= 80
+template <typename ArchTag>
+struct DefaultGemmType<
+    ArchTag,
+    float,
+    typename cutlass::platform::enable_if<
+        ArchTag::kMinComputeCapability >= 80>::type> {
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Operator = cutlass::arch::OpMultiplyAddFastF32;
+  static constexpr int kMinimumAlignment = 4;
+  static constexpr int WarpK = 32;
+  static constexpr int ThreadK = 32;
+};
+
+// Tensor-core settings for any 16-bit scalar_t (f16/bf16) on Sm75 and newer
+template <typename ArchTag, typename scalar_t>
+struct DefaultGemmType<
+    ArchTag,
+    scalar_t,
+    typename cutlass::platform::enable_if<
+        ArchTag::kMinComputeCapability >= 75 &&
+        cutlass::sizeof_bits<scalar_t>::value == 16>::type> {
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+  static constexpr int kMinimumAlignment = 4;
+  static constexpr int WarpK = 32;
+  static constexpr int ThreadK = 32;
+};
+
+// Full specialization: tensor-core settings for f16 on Volta (Sm70)
+template <>
+struct DefaultGemmType<cutlass::arch::Sm70, cutlass::half_t, void> {
+  using OpClass = cutlass::arch::OpClassTensorOp;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+  using Operator = cutlass::arch::OpMultiplyAdd;
+  static constexpr int kMinimumAlignment = 2;
+  static constexpr int WarpK = 32;
+  static constexpr int ThreadK = 32;
+};
+
+// Compile-time choice between two callables: call_conditional<kVal, TA,
+// TB>::apply(ta, tb, arg) returns ta(arg) when kVal is true and tb(arg)
+// otherwise, so `ta` and `tb` may return different types.
+template <bool kVal, typename TA, typename TB>
+struct call_conditional; // declared only; both bool values are specialized
+
+template <typename TA, typename TB>
+struct call_conditional<true, TA, TB> { // kVal == true: invoke `ta`
+  template <typename Arg>
+  static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg)
+      -> decltype(ta(arg)) {
+    return ta(arg);
+  }
+};
+
+template <typename TA, typename TB>
+struct call_conditional<false, TA, TB> { // kVal == false: invoke `tb`
+  template <typename Arg>
+  static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg)
+      -> decltype(tb(arg)) {
+    return tb(arg);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Mark a variable as warp-uniform - enables some compiler optimizations.
+// Done by overwriting its 32-bit image with lane 0's copy via __shfl_sync.
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+CUTLASS_DEVICE T warp_uniform(T value) {
+  struct {
+    union { // lets the bits of `value` be read back as one 32-bit word
+      T value;
+      uint32_t asInt; // NOTE(review): only 32 bits are exchanged - check T
+    };
+  } p;
+  p.value = value;
+  p.asInt = __shfl_sync(0xffffffff, (unsigned)p.asInt, 0); // from lane 0
+  return p.value;
+}
+
+template <typename T>
+CUTLASS_DEVICE T* warp_uniform(T* ptr) { // pointer overload
+  struct {
+    union {
+      T* ptr;
+      uint32_t asInt[2]; // two words: covers a pointer of up to 64 bits
+    };
+  } p;
+  p.ptr = ptr;
+  p.asInt[0] = warp_uniform(p.asInt[0]); // each word is broadcast on its own
+  p.asInt[1] = warp_uniform(p.asInt[1]);
+  return p.ptr;
+}
+} // namespace gemm_kernel_utils
diff --git a/third_party/fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h b/third_party/fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h
new file mode 100644
index 0000000000..930ee46dfe
--- /dev/null
+++ b/third_party/fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h
@@ -0,0 +1,142 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Instantiates the right WarpIterator to read from shared memory
+    The class `DefaultWarpIteratorAFromSharedMemory` is useful when reading
+        data dumped with `B2bGemm::accumToSmem`.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h"
+#include "cutlass/platform/platform.h"
+
+#include "warp_iterator_from_smem.h"
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+template < // primary template: defines nothing; see the specializations
+    typename WarpShape,
+    typename InstructionShape,
+    typename RegularWarpIterator,
+    typename Policy,
+    typename Enable = void> // SFINAE slot used by the enable_if cases
+struct DefaultWarpIteratorAFromSharedMemory {};
+
+// TensorOp, 16x8xK instructions: 16-bit elements with OpDelta::kRow == 1
+template <typename RegularWarpIterator, typename Policy, int kInstrK>
+struct DefaultWarpIteratorAFromSharedMemory<
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, kInstrK>,
+    RegularWarpIterator,
+    Policy,
+    typename platform::enable_if<(
+        sizeof_bits<typename RegularWarpIterator::Element>::value == 16 &&
+        Policy::Operator::Policy::OpDelta::kRow == 1)>::type> {
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, kInstrK>;
+  using WarpShape = cutlass::MatrixShape<32, 32>;
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+
+  using WarpIterator = cutlass::gemm::warp::WarpIteratorFromSmem<
+      cutlass::gemm::Operand::kA,
+      typename RegularWarpIterator::Element,
+      cutlass::MatrixShape<InstructionShape::kM, InstructionShape::kK>>;
+};
+
+// TensorOp, 16x8x8 (Ampere f32): non-16-bit element or OpDelta::kRow != 1
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    RegularWarpIterator,
+    Policy,
+    typename platform::enable_if<(
+        sizeof_bits<typename RegularWarpIterator::Element>::value != 16 ||
+        Policy::Operator::Policy::OpDelta::kRow != 1)>::type> {
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+  static constexpr auto kWarpSize = 32;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using WarpIterator =
+      cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator<
+          cutlass::MatrixShape<WarpShape::kM, WarpShape::kK>,
+          cutlass::gemm::Operand::kA,
+          typename RegularWarpIterator::Element,
+          cutlass::layout::RowMajor,
+          cutlass::MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+          OpDelta::kRow,
+          kWarpSize>;
+};
+
+// TensorOp, 16x16x4 instructions (Volta): crosswise<16, 32> smem layout
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<16, 16, 4>,
+    RegularWarpIterator,
+    Policy> {
+  using OpDelta = typename Policy::Operator::Policy::OpDelta;
+  static constexpr auto kWarpSize = 32;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 16, 4>;
+
+  using WarpIterator =
+      cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator<
+          cutlass::MatrixShape<32, 32>, // fixed size, rather than
+                                        // MatrixShape<WarpShape::kM, kK>
+          cutlass::gemm::Operand::kA,
+          typename RegularWarpIterator::Element,
+          cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>,
+          cutlass::MatrixShape<16, 4>,
+          OpDelta::kRow,
+          kWarpSize>;
+};
+
+// Simt (1x1x1 instruction shape)
+template <typename WarpShape, typename RegularWarpIterator, typename Policy>
+struct DefaultWarpIteratorAFromSharedMemory<
+    WarpShape,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    RegularWarpIterator,
+    Policy> {
+  static constexpr auto kWarpSize = 32;
+  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
+
+  // The regular iterator is reused unchanged because the shared-memory
+  // schema is the same. Original note: modify it for non-complete tiles.
+  using WarpIterator = RegularWarpIterator;
+};
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/third_party/fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h b/third_party/fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h
new file mode 100644
index 0000000000..7a52e96a36
--- /dev/null
+++ b/third_party/fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h
@@ -0,0 +1,751 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue iterator that supports prefetching
+
+  Mostly copied from "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
+*/
+
+#pragma once
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in
+/// epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator |
+/// ForwardTileIterator
+///
+template <
+    typename ThreadMap_, ///< Thread map (concept: OutputTileThreadMap)
+    typename Element_, ///< Element data type
+    bool ScatterD = false, ///< Scatter D operand or not
+    bool UseCUDAStore = false>
+class PredicatedTileIteratorPrefetch {
+ public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert(
+      ThreadMap::Iterations::kRow > 0,
+      "ThreadMap::Iterations::kRow must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kGroup > 0,
+      "ThreadMap::Iterations::kGroup must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kCluster > 0,
+      "ThreadMap::Iterations::kCluster must be > 0");
+  static_assert(
+      ThreadMap::Iterations::kColumn > 0,
+      "ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+      Element,
+      ThreadMap::Iterations::kColumn * ThreadMap::Iterations::kRow *
+          ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster *
+          ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+
+  //
+  // Parameters struct
+  //
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileIteratorParams {
+    using Base = PredicatedTileIteratorParams;
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : PredicatedTileIteratorParams(
+              layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+              make_OutputTileThreadMapDesc<ThreadMap>()) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : Base(base) {}
+  };
+
+  /// Mask object
+  struct Mask {
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    ///< Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    ///< Efficiently enables all accesses guarded by mask
+    CUTLASS_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorParams params_;
+
+  /// Byte-level pointer
+  uint8_t* byte_pointer_;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// Extent of the matrix tile in columns
+  Index extent_column_;
+
+  /// A thread's starting row position (assuming steady-state predicates have
+  /// been computed)
+  Index thread_start_row_;
+
+  /// A thread's starting column
+  Index thread_start_column_;
+
+  /// Internal state counter
+  int state_[3];
+
+  /// Scatter indices
+  int const* indices_;
+
+  //
+  // Static asserts about internal strides
+  //
+
+  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
+  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
+  static_assert(
+      sizeof(PredicatedTileIteratorParams::stride) == 8,
+      "Expected 64b strides");
+
+ private:
+  //
+  // Methods
+  //
+
+ public:
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  PredicatedTileIteratorPrefetch(
+      PredicatedTileIteratorParams const& params,
+      Element* pointer,
+      TensorCoord extent,
+      int thread_idx,
+      TensorCoord threadblock_offset = TensorCoord(),
+      int const* indices = nullptr)
+      : params_(params), indices_(indices) {
+    TensorCoord thread_offset =
+        ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    extent_row_ = extent.row();
+    extent_column_ = extent.column();
+
+    thread_start_row_ = thread_offset.row();
+    thread_start_column_ = thread_offset.column();
+
+    // Initialize predicates
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+      mask_.predicates[c] =
+          ((thread_offset.column() + ThreadMap::Delta::kColumn * c) <
+           extent.column());
+    }
+
+    // Null pointer performs no accesses
+    if (!pointer) {
+      mask_.clear();
+    }
+
+    if (ScatterD && !indices) {
+      mask_.clear();
+    }
+
+    // Initialize pointer
+    byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
+        LongIndex(thread_offset.row()) * LongIndex(params_.stride) +
+        LongIndex(thread_offset.column()) * sizeof(AccessType) /
+            kElementsPerAccess;
+
+    if (ScatterD) {
+      byte_pointer_ = reinterpret_cast<uint8_t*>(pointer) +
+          LongIndex(thread_offset.column()) * sizeof(AccessType) /
+              kElementsPerAccess;
+    }
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_DEVICE
+  void prefetch_all() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < kIterations; ++iter) {
+      prefetch();
+      ++(*this);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void prefetch() {
+    uint8_t* byte_pointer = byte_pointer_;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            // on windows using unsigned long here gives the error
+            // error: asm operand type size(4) does not match
+            // type/size implied by constraint 'l'
+            uint64_t addr = (uint64_t)((void*)&memory_pointer
+                                           [column * ThreadMap::Delta::kColumn /
+                                            kElementsPerAccess]);
+            asm volatile("prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType*>(
+                byte_pointer + byte_offset +
+                LongIndex(indices_[row_offset + thread_start_row_]) *
+                    LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) const {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType*>(
+                byte_pointer + byte_offset +
+                LongIndex(indices_[row_offset + thread_start_row_]) *
+                    LongIndex(params_.stride));
+          }
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            if (UseCUDAStore) {
+              if (guard) {
+                memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess] =
+                        frag_ptr
+                            [frag_row_idx * ThreadMap::Iterations::kColumn +
+                             column];
+              }
+            } else {
+              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                  frag_ptr
+                      [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                  (void*)&memory_pointer
+                      [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                  guard);
+            }
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) const {
+    store_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment; the byte_offset argument is shadowed below and never read
+  CUTLASS_DEVICE
+  void downsample_load_with_byte_offset(
+      Fragment& frag,
+      int64_t byte_offset,
+      int convolution_P,
+      int convolution_Q,
+      int add_P,
+      int add_Q,
+      int problem_N) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+
+          int input_row = output_N * 2 * convolution_P * 2 * convolution_Q +
+              (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q;
+
+          int64_t byte_offset =
+              (input_row - output_row) * problem_N * sizeof(float);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment; the byte_offset argument is shadowed below and never read
+  CUTLASS_DEVICE
+  void upsample_load_with_byte_offset(
+      Fragment& frag,
+      int64_t byte_offset,
+      int convolution_P,
+      int convolution_Q,
+      int add_P,
+      int add_Q,
+      int problem_N) const {
+    uint8_t* byte_pointer = byte_pointer_;
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster;
+         ++cluster) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+          int frag_row_idx =
+              (row +
+               ThreadMap::Iterations::kRow *
+                   (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow +
+              group * ThreadMap::Delta::kGroup +
+              cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+          int row_add_P = add_P;
+          int row_add_Q = add_Q;
+          if (output_P > convolution_P - 2)
+            row_add_P = 0;
+          if (output_Q > convolution_Q - 2)
+            row_add_Q = 0;
+
+          int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) +
+              ((output_P + row_add_P) / 2) * (convolution_Q / 2) +
+              (output_Q + row_add_Q) / 2;
+
+          int64_t byte_offset =
+              (input_row - output_row) * problem_N * sizeof(float);
+
+          AccessType* memory_pointer =
+              reinterpret_cast<AccessType*>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn;
+               ++column) {
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+                frag_ptr
+                    [frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  MatrixCoord thread_start() const {
+    return MatrixCoord(thread_start_row_, thread_start_column_);
+  }
+
+  /// Need to get the thread start row from the tile iterator
+  CUTLASS_DEVICE
+  int32_t thread_start_row() const {
+    return thread_start_row_;
+  }
+
+  /// Need to get the thread start column from the tile iterator
+  CUTLASS_DEVICE
+  int32_t thread_start_column() const {
+    return thread_start_column_;
+  }
+
+  /// Extent of the matrix in rows
+  CUTLASS_DEVICE
+  Index extent_row() const {
+    return extent_row_;
+  }
+
+  /// Extent of the matrix in columns
+  CUTLASS_DEVICE
+  Index extent_column() const {
+    return extent_column_;
+  }
+
+  /// Advances to the next position to load or store
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorPrefetch& operator++() {
+    ++state_[0];
+
+    if (!ScatterD) {
+      byte_pointer_ += params_.advance_row;
+    }
+
+    thread_start_row_ += ThreadMap::Shape::kRow;
+
+    if (state_[0] == ThreadMap::Count::kRow) {
+      state_[0] = 0;
+      ++state_[1];
+      byte_pointer_ += params_.advance_group;
+
+      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
+          ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+      if (state_[1] == ThreadMap::Count::kGroup) {
+        state_[1] = 0;
+        ++state_[2];
+        byte_pointer_ += params_.advance_cluster;
+
+        thread_start_row_ += ThreadMap::Count::kGroup *
+            ThreadMap::Shape::kGroup * ThreadMap::Count::kRow *
+            ThreadMap::Shape::kRow;
+
+        if (state_[2] == ThreadMap::Count::kCluster) {
+          state_[2] = 0;
+          byte_pointer_ += params_.advance_tile;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  ///< Efficiently disables all accesses guarded by mask
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  ///< Efficiently enables all accesses guarded by mask
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  ///< Gets the mask
+  CUTLASS_DEVICE void get_mask(Mask& mask) const {
+    mask = mask_;
+  }
+
+  ///< Sets the mask
+  CUTLASS_DEVICE void set_mask(Mask const& mask) {
+    mask_ = mask;
+  }
+};
+
+template <typename IT>
+struct MakePrefetchableIterator {
+  using Iterator = PredicatedTileIteratorPrefetch<
+      typename IT::ThreadMap,
+      typename IT::Element>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/iterators/make_residual_last.h b/third_party/fused_multi_head_attention/iterators/make_residual_last.h
new file mode 100644
index 0000000000..a667d67527
--- /dev/null
+++ b/third_party/fused_multi_head_attention/iterators/make_residual_last.h
@@ -0,0 +1,97 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "predicated_tile_access_iterator_residual_last.h"
+#include "predicated_tile_iterator_residual_last.h"
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+template <typename BaseIterator>
+struct MakeIteratorResidualLast;
+
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    int AccessSize,
+    bool Gather>
+struct MakeIteratorResidualLast<PredicatedTileIterator<
+    Shape,
+    Element,
+    Layout,
+    AdvanceRank,
+    ThreadMap,
+    AccessSize,
+    Gather>> {
+  using Iterator = PredicatedTileIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessSize,
+      Gather>;
+};
+
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    typename AccessType,
+    bool Gather>
+struct MakeIteratorResidualLast<PredicatedTileAccessIterator<
+    Shape,
+    Element,
+    Layout,
+    AdvanceRank,
+    ThreadMap,
+    AccessType,
+    Gather>> {
+  using Iterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessType,
+      Gather>;
+};
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
diff --git a/third_party/fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h b/third_party/fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h
new file mode 100644
index 0000000000..d007f0445b
--- /dev/null
+++ b/third_party/fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h
@@ -0,0 +1,2114 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+    from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile
+   this iterator visits may be partial, then the remaining tiles are complete.
+   So, we only need to compute the predicates twice, once before the first tile
+   and once for the remaining full tiles which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+    stored in registers, and integer addition is used to advance the pointer
+    through memory.
+*/
+
+#pragma once
+
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Primary template of PredicatedTileAccessIteratorResidualLast. Declared
+/// only; the layout-specific partial specializations supply the definitions.
+template <
+    typename Shape,
+    typename Element,
+    typename Layout,
+    int AdvanceRank,
+    typename ThreadMap,
+    typename AccessType,
+    bool Gather = false>
+class PredicatedTileAccessIteratorResidualLast;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for pitch-linear
+/// data. Tracks a byte pointer and a predicate mask; the mask saved by the
+/// constructor can be re-installed later through set_residual_tile(true).
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::PitchLinear,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape,
+      Element,
+      Layout,
+      AdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector =
+      ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(
+      !(ThreadMap::kElementsPerAccess % AccessType::kElements),
+      "Vectors implied by the thread map must be divisible by the access type.");
+
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileAccessIteratorParams {
+    using Base = PredicatedTileAccessIteratorParams;
+
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : Base(
+              layout.stride(0),
+              MakePredicatedTileAccessIteratorDesc<
+                  Shape,
+                  Element,
+                  Layout,
+                  kAdvanceRank,
+                  ThreadMap>()()) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : Base(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+ private:
+  // Data members. the_predicates holds the live predicate state;
+  // residual_tile_mask is the mask saved by the constructor and re-applied
+  // by set_residual_tile(true).
+
+  UnderlyingPredicates the_predicates;
+  Mask residual_tile_mask;
+
+  /// Parameters object with precomputed internal state
+  Params params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Below is used when Gather is turned on.  We need to record strided_offset
+  /// and contiguous_offset separately to compute the offset by using
+  ///
+  /// offset = contiguous_offset + indices[strided_offset]
+  ///
+
+  /// Gather indices
+  int const* indices_;
+  /// Strided coordinate used to index indices_ (written only when Gather)
+  Index gather_offset_strided;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : the_predicates(extent), // initializers follow declaration order
+        params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        indices_(indices) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+    the_predicates.get_mask(residual_tile_mask); // for set_residual_tile()
+
+    // Working around a weird compiler bug happening on P100 for the backward.
+    // I've seen together: the_predicates.predicates_[0] = 14 (instead of 15)
+    // residual_tile_mask[0] = 15 (correct)
+    //
+    // Adding prints when the value is calculated (in `compute_predicates_`)
+    // sometimes removes the bug. The consequence is that we skip some
+    // element of a tensor, leading to wrong results
+    // Setting `compute_predicates_`'s second argument (`is_steady_state`) to
+    // true also seems to get rid of the bug - at the cost of twice as many
+    // comparisons.
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+    constexpr bool kWorkAroundCompilerBug = false;
+#else
+    constexpr bool kWorkAroundCompilerBug = true;
+#endif
+    the_predicates.compute_predicates_(extent, !kWorkAroundCompilerBug);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+
+    if (!Gather) {
+      add_pointer_offset(layout(the_predicates.thread_offset_));
+    } else {
+      gather_offset_strided = the_predicates.thread_offset_.strided();
+      add_pointer_offset(
+          layout(make_Coord(the_predicates.thread_offset_.contiguous(), 0)));
+    }
+  }
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  /// When `is_residual_tile` is true, re-installs the mask the constructor
+  /// saved right after set_predicates(); otherwise leaves the mask untouched.
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool is_residual_tile) {
+    if (is_residual_tile) the_predicates.set_mask(residual_tile_mask);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances by whole tiles. NOTE(review): non-advance-rank terms are added
+  /// to the byte pointer without element-size scaling - confirm intended.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    if (!Gather) {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    } else {
+      add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+      gather_offset_strided += Shape::kStrided * tile_offset.strided();
+    }
+  }
+
+  /// Returns a pointer (nullptr when gathering and the access is invalid)
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    if (Gather) {
+      assert(indices_);
+      if (!valid()) {
+        return nullptr;
+      }
+
+      // NOTE(review): iteration_vector_ is added as a byte count here, but in
+      // units of AccessType in the non-gather return below - confirm intended.
+      LongIndex contiguous_offset = the_predicates.iteration_contiguous_ *
+              (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value /
+               8) +
+          the_predicates.iteration_vector_;
+      int strided_index = gather_offset_strided +
+          the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
+      LongIndex strided_offset = indices_[strided_index] *
+          LongIndex(params_.stride_) * sizeof_bits<Element>::value / 8;
+
+      return reinterpret_cast<AccessType*>(
+          pointer_ + contiguous_offset + strided_offset);
+    }
+
+    return reinterpret_cast<AccessType*>(
+               pointer_ +
+               the_predicates.iteration_contiguous_ *
+                   (ThreadMap::Delta::kContiguous *
+                    sizeof_bits<Element>::value) /
+                   8) +
+        the_predicates.iteration_vector_;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    the_predicates.operator++();
+
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ <
+        ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      if (!Gather) {
+        pointer_ += params_.inc_strided_;
+      }
+
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    if (!Gather) {
+      // advance to next tile
+      pointer_ += params_.inc_next_;
+
+      // now return to start tile - if the iterator is subsequently advanced,
+      // this subtraction as well as the subsequent integer addition are both
+      // elided by the compiler.
+      pointer_ -= params_.inc_advance_;
+    }
+
+    return *this;
+  }
+
+  /// Post-increment: advances and returns a copy of the previous state.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Forwards to the underlying predicates' enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    the_predicates.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major
+/// data. Thin adapter: passes (row, column) as the pitch-linear coordinate
+/// pair and forwards every operation to the pitch-linear specialization.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a column-major tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    /// Construct the Params object from the underlying iterator's base params
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices; forwarded unchanged to the underlying pitch-linear
+      /// iterator (only consulted there when Gather is true)
+      int const* indices = nullptr
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column()),
+            indices) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  /// Forwards the residual-tile flag to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Pre-increment.
+  ///
+  /// Forwards to the underlying pitch-linear iterator's pre-increment; this
+  /// wrapper holds no additional iteration state of its own.
+  /// Returns a reference to this iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Takes a copy of this iterator, advances this iterator through the
+  /// pre-increment operator above, and returns the copy, i.e. the state
+  /// before the advance. Prefer pre-increment when the previous state is
+  /// not needed, as it avoids the copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Forwards to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major
+/// data. Thin adapter: passes (column, row) as the pitch-linear coordinate
+/// pair and forwards every operation to the pitch-linear specialization.
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    bool Gather>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a row-major tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    /// Construct the Params object from the underlying iterator's base params
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices (forwarded to the underlying pitch-linear iterator)
+      int const* indices = nullptr)
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row()),
+            indices) {}
+
+  /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  /// Forwards the residual-tile flag to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Pre-increment.
+  ///
+  /// Forwards to the underlying pitch-linear iterator's pre-increment; this
+  /// wrapper holds no additional iteration state of its own.
+  /// Returns a reference to this iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Takes a copy of this iterator, advances this iterator through the
+  /// pre-increment operator above, and returns the copy, i.e. the state
+  /// before the advance. Prefer pre-increment when the previous state is
+  /// not needed, as it avoids the copy.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Forwards to the underlying iterator's enable_mask()
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2
+/// data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRankN<2>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape,
+      Element,
+      layout::PitchLinear,
+      AdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector =
+      ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(
+      !(ThreadMap::kElementsPerAccess % AccessType::kElements),
+      "Vectors implied by the thread map must be divisible by the access type.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingPredicates::Mask;
+
+  /// Host-constructible bundle of precomputed strides and byte increments
+  class Params {
+   public:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+   private:
+    /// strides copied from the layout (units of Element)
+    Coord<Layout::kStrideRank, Layout::LongIndex> stride_;
+    /// amount (in byte) to increment pointer to move to next access along
+    /// contiguous dimension
+    LongIndex inc_contiguous_;
+    /// amount (in byte) to increment pointer from first access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_next_strided_;
+    /// bytes from the last access of a tile to the first access of the next
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+    /// Default ctor: zero-initializes the stride and every increment
+    CUTLASS_HOST_DEVICE
+    Params()
+        : stride_(0),
+          inc_contiguous_(0),
+          inc_strided_(0),
+          inc_next_strided_(0),
+          inc_next_(0),
+          inc_advance_(0) {}
+
+    /// Derives every byte increment from the two strides of `layout`
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : stride_({layout.stride(0), layout.stride(1)}) {
+      inc_contiguous_ =
+          (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) *
+          sizeof_bits<Element>::value / 8;
+
+      inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) *
+          sizeof_bits<Element>::value / 8;
+      // one strided step minus the contiguous steps already taken in this run
+      inc_next_strided_ = inc_strided_ -
+          LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_;
+
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ = Shape::kStrided * LongIndex(stride_[1]) *
+            sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ =
+            Shape::kContiguous * stride_[0] * sizeof_bits<Element>::value / 8;
+      }
+      // one tile advance minus every in-tile step taken to reach the last access
+      inc_next_ = inc_advance_ -
+          LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ -
+          LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_;
+    }
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  UnderlyingPredicates the_predicates;
+  Mask residual_tile_mask;
+
+ private:
+  /// Recomputes the predicates by forwarding both arguments to the_predicates.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+
+ public:
+  /// Builds the iterator from precomputed Params, a base pointer, the tensor
+  /// extent, the calling thread's ID and an initial threadblock offset.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; accepted but never read by this
+                  ///< specialization (no gather/scatter support)
+      )
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+    // NOTE(review): residual_tile_mask is not assigned in this constructor;
+    // confirm it is initialized before set_residual_tile(true) reads it.
+    Layout layout(params_.stride_); // maps thread_offset_ to an element offset
+    add_pointer_offset(layout(the_predicates.thread_offset_));
+  }
+
+  /// Convenience overload: delegates to the five-argument constructor with a
+  /// threadblock offset of make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to the_predicates)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool is_residual_tile) {
+    // Only a residual tile swaps in residual_tile_mask; otherwise do nothing.
+    if (!is_residual_tile) return;
+    the_predicates.set_mask(residual_tile_mask);
+  }
+
+  /// Moves pointer_ by `pointer_offset` Elements, converted to whole bytes
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances by whole tiles. NOTE(review): only the kAdvanceRank term is scaled
+  /// to bytes (inc_advance_); the other term is a raw count - confirm intent.
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    if (kAdvanceRank) {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]);
+      pointer_ += Shape::kContiguous * tile_offset[0];
+    } else {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]);
+      pointer_ += Shape::kStrided * tile_offset[1];
+    }
+  }
+
+  /// Returns pointer_ viewed as AccessType*, offset by iteration_vector_
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(pointer_) +
+        the_predicates.iteration_vector_;
+  }
+
+  /// Advances to the next access within the current tile.
+  ///
+  /// Calls the_predicates.operator++(), then walks the vector, contiguous and
+  /// strided iteration counters in that order, bumping pointer_ by the matching
+  /// precomputed increment. After the last access the counters wrap to zero
+  /// and the two final adjustments bring pointer_ back to the tile's start.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    the_predicates.operator++();
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+
+    if (the_predicates.iteration_contiguous_ <
+        ThreadMap::Iterations::kContiguous) {
+      pointer_ += params_.inc_contiguous_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iterations::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_next_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_strided_ == ThreadMap::Iterations::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Takes a copy of the iterator, advances *this through the pre-increment
+  /// operator, and returns the copy, i.e. the state from before the step.
+  /// A full copy of the iterator is made on every call, so prefer the
+  /// pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast previous = *this;
+    ++(*this);
+    return previous;
+  }
+
+  /// Clears the predicate set; `enable` is forwarded to the_predicates
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+
+  /// Enables the predicate set (forwards to the_predicates.enable_mask)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding the value held by the_predicates
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    the_predicates.set_mask(mask);
+  }
+
+  /// Gets the mask via the out-parameter `mask` (forwarded to the_predicates)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    the_predicates.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not, as reported by the_predicates
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return the_predicates.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2
+/// column-major data; a thin adapter over the AffineRankN<2> specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Wrapped AffineRankN<2> iterator: (kRow, kColumn) shape, same advance rank
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object: wraps the underlying iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object of the wrapped iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds the wrapped Params from layout.stride(0) and layout.stride(1)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the adapter; extent and threadblock offset are forwarded to the
+  /// wrapped iterator as (row, column) pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; accepted but never read by this
+                  ///< specialization (no gather/scatter support)
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column())) {}
+
+  /// Convenience overload: delegates to the constructor above with a
+  /// threadblock offset of make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances by whole tiles; the offset is forwarded to iterator_ as
+  /// (row, column)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset(
+        make_Coord(tile_offset.row(), tile_offset.column()));
+  }
+
+  /// Returns the wrapped iterator's current pointer as AccessType*
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Pre-increment.
+  ///
+  /// Forwards to the wrapped UnderlyingIterator's operator++, which owns all
+  /// pointer and predicate bookkeeping, and returns a reference to this
+  /// adapter. No member other than iterator_ is modified by this wrapper;
+  /// it is the adapter's sole data member.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Copies the adapter, advances *this through the pre-increment operator
+  /// above, and returns the copy, i.e. the state from before the step.
+  /// A full copy of the wrapped iterator is made on every call, so prefer
+  /// the pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set; `enable` is forwarded to iterator_
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set (forwards to iterator_.enable_mask)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask via the out-parameter `mask` (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not, as reported by iterator_
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank-2
+/// row-major data; a thin adapter over the AffineRankN<2> specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Wrapped AffineRankN<2> iterator: (kColumn, kRow) shape, swapped advance rank
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object: wraps the underlying iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object of the wrapped iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds the wrapped Params from layout.stride(1) and layout.stride(0)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the adapter; extent and threadblock offset are forwarded to the
+  /// wrapped iterator as (column, row) pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      ///< Precomputed parameters object
+      Params const& params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; accepted but never read by this
+                  ///< specialization (no gather/scatter support)
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row())) {}
+
+  /// Convenience overload: delegates to the constructor above with a
+  /// threadblock offset of make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances by whole tiles; the offset is forwarded to iterator_ as
+  /// (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset(
+        make_Coord(tile_offset.column(), tile_offset.row()));
+  }
+
+  /// Returns the wrapped iterator's current pointer as AccessType*
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Pre-increment.
+  ///
+  /// Forwards to the wrapped UnderlyingIterator's operator++, which owns all
+  /// pointer and predicate bookkeeping, and returns a reference to this
+  /// adapter. No member other than iterator_ is modified by this wrapper;
+  /// it is the adapter's sole data member.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Copies the adapter, advances *this through the pre-increment operator
+  /// above, and returns the copy, i.e. the state from before the step.
+  /// A full copy of the wrapped iterator is made on every call, so prefer
+  /// the pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set; `enable` is forwarded to iterator_
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set (forwards to iterator_.enable_mask)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask via the out-parameter `mask` (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not, as reported by iterator_
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major
+/// interleaved data. It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    int InterleavedK>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kRow * kInterleavedK,
+          Shape::kColumn / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object: wraps the underlying pitch-linear iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object of the wrapped iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds the wrapped pitch-linear Params from layout.stride(0)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the adapter; rows are multiplied and columns divided by
+  /// kInterleavedK before extent and offset reach the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; accepted but never read by this
+                  ///< specialization (no gather/scatter support)
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.row() * kInterleavedK,
+                extent.column() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row() * kInterleavedK,
+                threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Convenience overload: delegates to the constructor above with a
+  /// threadblock offset of make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances by whole tiles; the offset is forwarded to iterator_ unscaled as
+  /// (row, column)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns the wrapped iterator's current pointer as AccessType*
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Pre-increment.
+  ///
+  /// Forwards to the wrapped UnderlyingIterator's operator++, which owns all
+  /// pointer and predicate bookkeeping, and returns a reference to this
+  /// adapter. No member other than iterator_ is modified by this wrapper;
+  /// it is the adapter's sole data member.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Copies the adapter, advances *this through the pre-increment operator
+  /// above, and returns the copy, i.e. the state from before the step.
+  /// A full copy of the wrapped iterator is made on every call, so prefer
+  /// the pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set; `enable` is forwarded to iterator_
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set (forwards to iterator_.enable_mask)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask via the out-parameter `mask` (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not, as reported by iterator_
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major
+/// interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    typename AccessType_,
+    int InterleavedK>
+class PredicatedTileAccessIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessType_,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kColumn * kInterleavedK,
+          Shape::kRow / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object: wraps the underlying pitch-linear iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorResidualLast;
+
+    /// Parameters object of the wrapped iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds the wrapped pitch-linear Params from layout.stride(0)
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the adapter; columns are multiplied and rows divided by
+  /// kInterleavedK before extent and offset reach the wrapped iterator
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; accepted but never read by this
+                  ///< specialization (no gather/scatter support)
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.column() * kInterleavedK,
+                extent.row() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column() * kInterleavedK,
+                threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Convenience overload: delegates to the constructor above with a
+  /// threadblock offset of make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iterator_.set_iteration_index(index);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances by whole tiles; the offset is forwarded to iterator_ unscaled as
+  /// (column, row)
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const& tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns the wrapped iterator's current pointer as AccessType*
+  CUTLASS_HOST_DEVICE
+  AccessType* get() const {
+    return reinterpret_cast<AccessType*>(iterator_.get());
+  }
+
+  /// Pre-increment.
+  ///
+  /// Forwards to the wrapped UnderlyingIterator's operator++, which owns all
+  /// pointer and predicate bookkeeping, and returns a reference to this
+  /// adapter. No member other than iterator_ is modified by this wrapper;
+  /// it is the adapter's sole data member.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment.
+  ///
+  /// Copies the adapter, advances *this through the pre-increment operator
+  /// above, and returns the copy, i.e. the state from before the step.
+  /// A full copy of the wrapped iterator is made on every call, so prefer
+  /// the pre-increment form when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorResidualLast operator++(int) {
+    PredicatedTileAccessIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set; `enable` is forwarded to iterator_
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Enables the predicate set (forwards to iterator_.enable_mask)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask via the out-parameter `mask` (forwarded to iterator_)
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Returns whether access is valid or not, as reported by iterator_
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h b/third_party/fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h
new file mode 100644
index 0000000000..fa40d850c8
--- /dev/null
+++ b/third_party/fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h
@@ -0,0 +1,2119 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2
+   tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses. The first tile
+   this iterator visits may be partial, then the remaining tiles are complete.
+   So, we only need to compute the predicates twice, once before the first tile
+   and once for the remaining full tiles which can share the same predicates.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+
+#pragma once
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileIteratorResidualLast
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize
+/// register liveness and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params"
+/// object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is
+/// constructed. Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator
+/// is constructed. Subsequent additions to logical coordinate offset may be
+/// performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be
+/// partially full in both the advance dimension and the steady-state dimension.
+/// This is assumed to be the last tile in the iteration sequence. Advancing an
+/// iterator that has just been constructed moves to the first tile that is full
+/// in the advance dimension and recomputes predicates. Subsequent accesses may
+/// be performed without updating internal predicates and are efficient in terms
+/// of live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced
+/// at least once outside any looping structure to minimize integer arithmetic.
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to
+/// dereferencing the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params,
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offset);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and
+//                            // update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks -
+//                            // subsequent loads become NO-OPs.
+//     }
+//
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to
+//                            // steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator =
+//   transform::threadblock::PredicatedTileIteratorResidualLast;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+    typename Shape,
+    typename Element,
+    typename Layout, // selects the specialization defined below
+    int AdvanceRank, // specializations static_assert this is 0 or 1
+    typename ThreadMap,
+    int AccessSize = ThreadMap::kElementsPerAccess,
+    bool Gather = false>
+class PredicatedTileIteratorResidualLast;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::PitchLinear,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<
+      Element,
+      AccessSize,
+      (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      kAdvanceRank,
+      ThreadMap,
+      AccessType,
+      Gather>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    using Base = typename TileAccessIterator::Params::Base;
+
+    friend PredicatedTileIteratorResidualLast;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout) : params_(layout) {}
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Base const& base) : params_(base) {}
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char*;
+
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      /// Gather indices
+      int const* indices = nullptr)
+      : address_iterator_(
+            params.params_,
+            pointer,
+            extent,
+            thread_id,
+            threadblock_offset,
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile exactly like operator++()
+  /// above, but returns a copy of the iterator taken before the advance.
+  /// Prefer pre-increment when the previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    address_iterator_.clear_mask(enable);
+  }
+
+  /// Forwards the residual-tile flag to the underlying access iterator
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    address_iterator_.set_residual_tile(enable);
+  }
+
+  /// Enables the predicate mask (forwards to the access iterator)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    address_iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    address_iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    address_iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory, displaced by `pointer_offset` Elements
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    // Widen first so the scaling to bytes happens in LongIndex, not Index.
+    load_with_byte_offset(
+        frag, LongIndex(pointer_offset) * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Loads a fragment from memory, displaced by `byte_offset` bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          address_iterator_.set_iteration_index(idx);
+          char const* byte_ptr =
+              reinterpret_cast<char const*>(address_iterator_.get()) +
+              byte_offset;
+
+          AccessType const* access_ptr =
+              reinterpret_cast<AccessType const*>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory (offset scaled to bytes in LongIndex)
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    store_with_byte_offset(
+        frag, LongIndex(pointer_offset) * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v +
+              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          char* byte_ptr =
+              reinterpret_cast<char*>(address_iterator_.get()) + byte_offset;
+          AccessType* access_ptr = reinterpret_cast<AccessType*>(byte_ptr);
+
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_byte_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize,
+      Gather>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a column-major tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< Gather indices, forwarded unchanged to the
+                  ///< underlying pitch-linear iterator
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column()),
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Behaves like operator++() above, except that it returns a copy of the
+  /// iterator taken before the advance. Prefer pre-increment when the
+  /// previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Forwards the residual-tile flag to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Enables the predicate mask (forwards to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    bool Gather>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    Gather> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize,
+      Gather>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Construct the Params object given a row-major tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices = nullptr ///< Gather indices
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row()),
+            indices) {}
+
+  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
+  /// offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment: advances to the next tile in memory.
+  ///
+  /// Behaves like operator++() above, except that it returns a copy of the
+  /// iterator taken before the advance. Prefer pre-increment when the
+  /// previous state is not needed.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Forwards the residual-tile flag to the underlying pitch-linear iterator
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable);
+  }
+
+  /// Enables the predicate mask (forwards to the underlying iterator)
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank-2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRankN<2>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<
+      Element,
+      AccessSize,
+      (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
+      Shape,
+      Element,
+      Layout,
+      kAdvanceRank,
+      ThreadMap,
+      AccessType>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileIteratorResidualLast;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout) : params_(layout) {}
+
+    CUTLASS_HOST_DEVICE
+    Params() {}
+  };
+
+ private:
+  /// Byte-granular pointer type intended for address arithmetic
+  using BytePointer = char*;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Access iterator that supplies the address and predicate of every access
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs the iterator by forwarding its precomputed state, tensor
+  /// pointer, extent, thread ID and threadblock offset to the access iterator
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object (its wrapped params_ is forwarded)
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; never read, because this
+                  ///< specialization does not support gather/scatter
+      )
+      : address_iterator_(
+            params.params_,
+            pointer,
+            extent,
+            thread_id,
+            threadblock_offset) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating
+  /// to the five-argument constructor with make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element (forwarded to the access iterator)
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// Moves the access iterator by exactly one tile along the advance
+  /// dimension: tile offset (0, 1) when kAdvanceRank is 1 (strided) and
+  /// (1, 0) when it is 0 (contiguous). Predicate handling, if any, is left
+  /// to the access iterator's add_tile_offset().
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    // kAdvanceRank is a compile-time constant asserted to be 0 or 1, so only
+    // one of the two offsets is ever used by a given instantiation
+    address_iterator_.add_tile_offset(
+        kAdvanceRank ? make_Coord(0, 1) : make_Coord(1, 0));
+
+    return *this;
+  }
+
+  /// Post-increment form of the tile advance.
+  ///
+  /// Copies this iterator, advances *this with the pre-increment operator,
+  /// and returns the copy, i.e. the state from before the advance. Prefer
+  /// pre-increment when the previous state is not needed, since this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast previous = *this;
+    ++(*this);
+    return previous;
+  }
+
+  /// Forwards to the access iterator's clear_mask(); `enable` defaults to true
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    address_iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    address_iterator_.set_residual_tile(enable); // flag is forwarded unchanged
+  }
+
+  /// Forwards to the access iterator's enable_mask(); this does not clear it
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    address_iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask by handing `mask` to the access iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    address_iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: the access iterator fills the `mask` out-parameter
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    address_iterator_.get_mask(mask);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    LongIndex offset = pointer_offset; // Elements; widened before bit scaling
+    load_with_byte_offset(frag, offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    // Loads every access of the tile into frag, one AccessType slot each
+    auto* slots = reinterpret_cast<AccessType*>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int si = 0; si < ThreadMap::Iterations::kStrided; ++si) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int ci = 0; ci < ThreadMap::Iterations::kContiguous; ++ci) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int vi = 0; vi < kAccessesPerVector; ++vi) {
+          // Slot order: vector index fastest, then contiguous, then strided
+          int const access = ci + si * ThreadMap::Iterations::kContiguous;
+          int const slot = vi + kAccessesPerVector * access;
+          address_iterator_.set_iteration_index(slot);
+
+          // Offset the address by byte_offset; valid() is the load's guard
+          auto const* raw =
+              reinterpret_cast<char const*>(address_iterator_.get());
+          auto const* src =
+              reinterpret_cast<AccessType const*>(raw + byte_offset);
+          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
+              slots[slot], src, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory (load_with_byte_offset with a zero offset)
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Stores a fragment at an extra offset given in units of Element
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    LongIndex offset = pointer_offset; // widened so bit scaling cannot overflow
+    store_with_byte_offset(frag, offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Stores a fragment to memory, displacing each access by byte_offset bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    // One AccessType slot per access; the walk starts at iteration index 0
+    auto const* slots = reinterpret_cast<AccessType const*>(&frag);
+    address_iterator_.set_iteration_index(0);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int si = 0; si < ThreadMap::Iterations::kStrided; ++si) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int ci = 0; ci < ThreadMap::Iterations::kContiguous; ++ci) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int vi = 0; vi < kAccessesPerVector; ++vi) {
+          int const access = ci + si * ThreadMap::Iterations::kContiguous;
+          int const slot = vi + kAccessesPerVector * access;
+
+          auto* raw = reinterpret_cast<char*>(address_iterator_.get());
+          auto* dst = reinterpret_cast<AccessType*>(raw + byte_offset);
+          // Skip the write when valid() is false for this access
+          if (address_iterator_.valid()) {
+            *dst = slots[slot];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory (store_with_byte_offset with a zero offset)
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_byte_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
+/// column-major data; a thin adapter over the AffineRankN<2> specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2ColumnMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> iterator, keeping (row, column) order
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Per-thread fragment: kElementsPerAccess Elements for each iteration
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate mask type, taken from the underlying iterator
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Host-constructible wrapper around the underlying iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object of the underlying iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds an AffineRankN<2> layout from stride(0), stride(1) of `layout`
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator; every method forwards to it
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator, passing extent and threadblock offset to the
+  /// underlying iterator as (row, column) pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices; never read, because this
+                  ///< specialization does not support gather/scatter
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.row(), extent.column()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row(),
+                threadblock_offset.column())) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating
+  /// to the five-argument constructor with make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// This wrapper only pre-increments the underlying iterator; which
+  /// dimension that advances along was fixed by the advance rank given to
+  /// UnderlyingIterator. Predicate handling, if any, happens in the
+  /// underlying iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment form of the tile advance.
+  ///
+  /// Copies this iterator, advances *this via the pre-increment operator,
+  /// and returns the copy (the state before the advance). Prefer
+  /// pre-increment when the previous state is not needed, as this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Forwards to the underlying clear_mask(); `enable` defaults to true
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable); // flag is forwarded unchanged
+  }
+
+  /// Forwards to the underlying enable_mask(); this does not clear the mask
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: the underlying iterator fills the `mask` out-parameter
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory at an extra offset in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory at an extra offset in bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
+/// row-major data; a thin adapter over the AffineRankN<2> specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::AffineRank2RowMajor,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  // Map to the underlying AffineRankN<2> iterator with row and column swapped
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      Element,
+      layout::AffineRankN<2>,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Per-thread fragment: kElementsPerAccess Elements for each iteration
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate mask type, taken from the underlying iterator
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Host-constructible wrapper around the underlying iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object of the underlying iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds an AffineRankN<2> layout from stride(1), stride(0) of `layout`
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying AffineRankN<2> tile iterator; every method forwards to it
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator, passing extent and threadblock offset to the
+  /// underlying iterator as (column, row) pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id, ///< ID of each participating thread
+      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
+      int const* indices =
+          nullptr ///< gather/scatter indices; never read, because this
+                  ///< specialization does not support gather/scatter
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(extent.column(), extent.row()),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column(),
+                threadblock_offset.row())) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating
+  /// to the five-argument constructor with make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// This wrapper only pre-increments the underlying iterator; which
+  /// dimension that advances along was fixed by the advance rank given to
+  /// UnderlyingIterator. Predicate handling, if any, happens in the
+  /// underlying iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment form of the tile advance.
+  ///
+  /// Copies this iterator, advances *this via the pre-increment operator,
+  /// and returns the copy (the state before the advance). Prefer
+  /// pre-increment when the previous state is not needed, as this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Forwards to the underlying clear_mask(); `enable` defaults to true
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable); // flag is forwarded unchanged
+  }
+
+  /// Forwards to the underlying enable_mask(); this does not clear the mask
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: the underlying iterator fills the `mask` out-parameter
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory at an extra offset in bytes
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory at an extra offset in bytes
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Stores a fragment to memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for column-major
+/// interleaved data. It forwards to the layout::PitchLinear specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    int InterleavedK>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::ColumnMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kRow * kInterleavedK,
+          Shape::kColumn / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Per-thread fragment: kElementsPerAccess Elements for each iteration
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate mask type, taken from the underlying iterator
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Host-constructible wrapper around the underlying iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object of the underlying iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds a PitchLinear layout from stride(0) of the interleaved layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator; every method forwards to it
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator; rows are scaled up and columns scaled down by
+  /// kInterleavedK to form the underlying pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; never read, because this
+                  ///< specialization does not support gather/scatter
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.row() * kInterleavedK,
+                extent.column() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.row() * kInterleavedK,
+                threadblock_offset.column() / kInterleavedK)) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating
+  /// to the five-argument constructor with make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// This wrapper only pre-increments the underlying iterator; which
+  /// dimension that advances along was fixed by the advance rank given to
+  /// UnderlyingIterator. Predicate handling, if any, happens in the
+  /// underlying iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment form of the tile advance.
+  ///
+  /// Copies this iterator, advances *this via the pre-increment operator,
+  /// and returns the copy (the state before the advance). Prefer
+  /// pre-increment when the previous state is not needed, as this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Forwards to the underlying clear_mask(); `enable` defaults to true
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable); // flag is forwarded unchanged
+  }
+
+  /// Forwards to the underlying enable_mask(); this does not clear the mask
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: the underlying iterator fills the `mask` out-parameter
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorResidualLast for row-major
+/// interleaved data. It forwards to the layout::PitchLinear specialization.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+    typename Shape_,
+    typename Element_,
+    int AdvanceRank,
+    typename ThreadMap_,
+    int AccessSize,
+    int InterleavedK>
+class PredicatedTileIteratorResidualLast<
+    Shape_,
+    Element_,
+    layout::RowMajorInterleaved<InterleavedK>,
+    AdvanceRank,
+    ThreadMap_,
+    AccessSize,
+    false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element*;
+  using NonConstPointer = typename platform::remove_const<Element>::type*;
+
+  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
+      layout::PitchLinearShape<
+          Shape::kColumn * kInterleavedK,
+          Shape::kRow / kInterleavedK>,
+      Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap,
+      AccessSize>;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Per-thread fragment: kElementsPerAccess Elements for each iteration
+  using Fragment = cutlass::Array<
+      Element,
+      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate mask type, taken from the underlying iterator
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Host-constructible wrapper around the underlying iterator's Params
+  class Params {
+   private:
+    friend PredicatedTileIteratorResidualLast;
+
+    /// Parameters object of the underlying iterator
+    typename UnderlyingIterator::Params params_;
+
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    /// Builds a PitchLinear layout from stride(0) of the interleaved layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const& layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const& base)
+        : params_(base) {}
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator; every method forwards to it
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs the iterator; columns are scaled up and rows scaled down by
+  /// kInterleavedK to form the underlying pitch-linear coordinates
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      /// Precomputed parameters object
+      Params const& params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const& threadblock_offset,
+      int const* indices =
+          nullptr ///< gather/scatter indices; never read, because this
+                  ///< specialization does not support gather/scatter
+      )
+      : iterator_(
+            params.params_,
+            pointer,
+            layout::PitchLinearCoord(
+                extent.column() * kInterleavedK,
+                extent.row() / kInterleavedK),
+            thread_id,
+            layout::PitchLinearCoord(
+                threadblock_offset.column() * kInterleavedK,
+                threadblock_offset.row() / kInterleavedK)) {}
+
+  /// Constructs the iterator with a zero threadblock offset by delegating
+  /// to the five-argument constructor with make_Coord(0, 0)
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast(
+      Params const& params, ///< Precomputed parameters object
+      Pointer pointer, ///< Pointer to start of tensor
+      TensorCoord extent, ///< Extent of tensor
+      int thread_id ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorResidualLast(
+            params,
+            pointer,
+            extent,
+            thread_id,
+            make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// This wrapper only pre-increments the underlying iterator; which
+  /// dimension that advances along was fixed by the advance rank given to
+  /// UnderlyingIterator. Predicate handling, if any, happens in the
+  /// underlying iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast& operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Post-increment form of the tile advance.
+  ///
+  /// Copies this iterator, advances *this via the pre-increment operator,
+  /// and returns the copy (the state before the advance). Prefer
+  /// pre-increment when the previous state is not needed, as this form
+  /// copies the whole iterator.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorResidualLast operator++(int) {
+    PredicatedTileIteratorResidualLast self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Forwards to the underlying clear_mask(); `enable` defaults to true
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void set_residual_tile(bool enable) {
+    iterator_.set_residual_tile(enable); // flag is forwarded unchanged
+  }
+
+  /// Forwards to the underlying enable_mask(); this does not clear the mask
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const& mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask: the underlying iterator fills the `mask` out-parameter
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask& mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void load(Fragment& frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory at an extra offset in units of Element
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Stores a fragment to memory (zero pointer offset)
+  CUTLASS_DEVICE
+  void store(Fragment const& frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/iterators/transpose_warp_iterator.h b/third_party/fused_multi_head_attention/iterators/transpose_warp_iterator.h
new file mode 100644
index 0000000000..18858ab732
--- /dev/null
+++ b/third_party/fused_multi_head_attention/iterators/transpose_warp_iterator.h
@@ -0,0 +1,55 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include "warp_iterator_from_smem.h"
+
+template <typename WarpIterator>
+struct TransposeWarpIterator {  // Primary template: used when no specialization matches WarpIterator
+  using Iterator = char;  // stand-in type; no transposed iterator is provided here
+  static bool constexpr kSupportsTranspose = false;
+};
+
+template <
+    /// Operand identity
+    cutlass::gemm::Operand Operand,
+    /// Data type of the operand elements
+    typename Element,
+    typename InstructionShape,  // forwarded unchanged to WarpIteratorFromSmem
+    bool kTranspose>  // transpose flag of the iterator type being matched
+struct TransposeWarpIterator<
+    cutlass::gemm::warp::
+        WarpIteratorFromSmem<Operand, Element, InstructionShape, kTranspose>> {
+  using Iterator = cutlass::gemm::warp::  // same iterator type with kTranspose negated
+      WarpIteratorFromSmem<Operand, Element, InstructionShape, !kTranspose>;
+  static bool constexpr kSupportsTranspose = true;
+};
diff --git a/third_party/fused_multi_head_attention/iterators/warp_iterator_from_smem.h b/third_party/fused_multi_head_attention/iterators/warp_iterator_from_smem.h
new file mode 100644
index 0000000000..3f4ebec698
--- /dev/null
+++ b/third_party/fused_multi_head_attention/iterators/warp_iterator_from_smem.h
@@ -0,0 +1,283 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Inspired from
+   "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h". Loads tiles of GEMM
+   operands from a RowMajor shared-memory layout into registers for use by A100
+   TensorCores.
+
+    The differences with "mma_tensor_op_tile_access_iterator.h" are that:
+    (1) We use "ldmatrix" to load tiles rather than manual loads (slightly faster)
+    (2) We support transposing the operand (e.g. read `A.transpose()` when
+   the shared memory holds `A`)
+
+    This is only implemented for a few specific shapes (see the static_asserts).
+*/
+#pragma once
+
+#include <cutlass/gemm/gemm.h>
+
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+template <
+    /// Operand identity
+    Operand Operand_,
+    /// Data type of the operand elements
+    typename Element_,
+    typename InstructionShape_,  // shape of one MMA instruction (kRow / kColumn are used below)
+    bool kTranspose = false>  // when true, row/column of every offset are swapped
+class WarpIteratorFromSmem {
+ public:
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = cutlass::MatrixShape<32, 32>;
+
+  /// Operand tag
+  static Operand const kOperand = Operand_;
+  static_assert(
+      kOperand == Operand::kA,
+      "No support for OperandB at the moment");
+
+  /// Basic check (already implied by the stricter kA-only assert above)
+  static_assert(
+      kOperand == Operand::kA || kOperand == Operand::kB,
+      "WarpIteratorFromSmem may only be instantiated for A or B operands to warp-level Mma.");
+
+  /// Element type (must be 16 bits wide, see the assert below)
+  using Element = Element_;
+  static_assert(sizeof_bits<Element>::value == 16, "Only supported for half");
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+  static_assert(InstructionShape::kRow == 16, "Only supports 16x8x8 / 16x8x16");
+  static_assert(
+      InstructionShape::kColumn == 8 || InstructionShape::kColumn == 16,
+      "Only supports 16x8x8 / 16x8x16");
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept:
+  /// MatrixShape)
+  static int const kOpDelta = 1;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Number of elements per 32-bit shared-memory access (2 for 16-bit elements)
+  static int const kElementsPerAccess =
+      (sizeof_bits<Element>::value >= 32 ? 1
+                                         : 32 / sizeof_bits<Element>::value);
+
+  using InstructionCount = MatrixShape<
+      Shape::kRow / InstructionShape::kRow,
+      Shape::kColumn / InstructionShape::kColumn>;
+
+  static int const kIterations = (kOperand == Operand::kA)
+      ? InstructionCount::kColumn
+      : InstructionCount::kRow;
+
+ public:
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile
+  using Fragment = Array<
+      Element,
+      (kOperand == Operand::kA)
+          ? (Shape::kRow* InstructionShape::kColumn / kThreads)
+          : (Shape::kColumn* InstructionShape::kRow / kThreads)>;
+
+  /// Memory access type: four 32-bit words, filled by one ldsm<..., 4> call in load()
+  // using AccessType = AlignedArray<Element, kElementsPerAccess>;
+  using AccessType = Array<unsigned, 4>;
+
+  static int constexpr kWarpShapeDivisibleInner =
+      (kOperand == Operand::kA ? InstructionShape::kColumn
+                               : InstructionShape::kRow);
+  static int constexpr kAccessesInner =
+      (kWarpShapeDivisibleInner / kElementsPerAccess) / 4;
+  // Number of 8-row tiles per instruction (kRow / 8), loaded with `ldmatrix`
+  static int const kTilesPerInstruction = InstructionShape::kRow / 8;
+  static_assert(kTilesPerInstruction == 2, "Only supports 16x8x16 and 16x8x8");
+
+ private:
+  /// Underlying tensor reference
+  TensorRef ref_;
+
+  /// Accumulated coordinate offset applied to ref_ (per-lane start + tile offsets)
+  MatrixCoord origin_;
+
+  /// Current step within the tile; reset to 0 by advance()
+  int iterations_;
+
+ public:
+  /// Constructor from TensorRef; passes the full Shape as the extent
+  CUTLASS_HOST_DEVICE
+  WarpIteratorFromSmem(TensorRef const& ref, int lane_id)
+      : WarpIteratorFromSmem(ref, {Shape::kRow, Shape::kColumn}, lane_id) {}
+  CUTLASS_HOST_DEVICE  // NOTE(review): `extent` is not read in this constructor's body
+  WarpIteratorFromSmem(TensorRef const& ref, TensorCoord extent, int lane_id)
+      : ref_(ref), iterations_(0) {
+    // See also:
+    // https://docs.nvidia.com/cuda/archive/11.7.1/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-1688
+    // 16x8x8: kAccessesInner = 1 (1 ldmatrix.x4)
+    // 16x8x16: kAccessesInner = 2 (2 ldmatrix.x4)
+    int ldsm_vec_num = (lane_id >> 3);  // index of the 8-lane group this lane belongs to
+    if (kOperand == Operand::kA) {
+      origin_ = MatrixCoord(lane_id % 8, 0);
+      static_assert(
+          InstructionCount::kRow * kTilesPerInstruction == 4,
+          "can't use ldmatrix.x4");
+      int access_m_idx = ldsm_vec_num % kTilesPerInstruction;
+      int inner_idx = (ldsm_vec_num / kTilesPerInstruction) % kAccessesInner;
+      int inst_m_idx = ldsm_vec_num / (kTilesPerInstruction * kAccessesInner);
+      MatrixCoord offset(
+          access_m_idx * 8 + inst_m_idx * InstructionShape::kRow,
+          inner_idx * 4 * kElementsPerAccess);
+      if (kTranspose) {
+        offset = MatrixCoord(offset.column(), offset.row());
+      }
+      origin_ += offset;
+    } else {
+      // Note: This is not tested or used (the kA-only static_assert rejects kB)
+      origin_ = MatrixCoord(0, lane_id % 8);
+      static_assert(InstructionCount::kColumn * kAccessesInner == 4, "");
+      CUTLASS_PRAGMA_UNROLL
+      for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn;
+           ++inst_n_idx) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
+          int access_idx = inner_idx + kAccessesInner * inst_n_idx;
+
+          MatrixCoord offset(
+              inner_idx * 4 * kElementsPerAccess, inst_n_idx * 8);
+
+          if (access_idx == ldsm_vec_num) {
+            if (kTranspose) {
+              offset = MatrixCoord(offset.column(), offset.row());
+            }
+            origin_ += offset;
+          }
+        }
+      }
+    }
+
+    ref_.add_coord_offset(origin_);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles (row/column of the offset are swapped when kTranspose is set)
+  CUTLASS_HOST_DEVICE
+  WarpIteratorFromSmem& add_tile_offset(TensorCoord const& tile_offset) {
+    TensorCoord coord_offset(
+        tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
+    if (kTranspose) {
+      coord_offset = TensorCoord{coord_offset.column(), coord_offset.row()};
+    }
+    origin_ += coord_offset;
+
+    ref_.add_coord_offset(coord_offset);
+
+    return *this;
+  }
+
+  /// Moves one whole tile along the advance dimension and resets iterations_
+  CUTLASS_DEVICE
+  void advance() {
+    if (kOperand == Operand::kA) {
+      add_tile_offset({0, 1});
+    } else {
+      add_tile_offset({1, 0});
+    }
+
+    iterations_ = 0;
+  }
+
+  /// Steps to the next iteration; after kIterations steps, calls advance()
+  CUTLASS_HOST_DEVICE
+  WarpIteratorFromSmem& operator++() {
+    iterations_++;
+
+    if (iterations_ >= kIterations)
+      advance();
+
+    return *this;
+  }
+
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_DEVICE
+  void load(Fragment& frag) const {
+    AccessType* access_ptr = reinterpret_cast<AccessType*>(&frag);
+    using LoadLayout = typename platform::
+        conditional<kTranspose, layout::ColumnMajor, layout::RowMajor>::type;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_m_idx = 0; access_m_idx <
+         (InstructionCount::kRow * kTilesPerInstruction * kAccessesInner) / 4;
+         ++access_m_idx) {
+      MatrixCoord offset;
+      if (kOperand == Operand::kA) {
+        offset = MatrixCoord(
+            access_m_idx * 16, iterations_ * InstructionShape::kColumn);
+      } else {
+        offset = MatrixCoord(iterations_ * InstructionShape::kRow, 0);
+      }
+      if (kTranspose) {
+        offset = MatrixCoord(offset.column(), offset.row());
+      }
+      cutlass::arch::ldsm<LoadLayout, 4>(
+          access_ptr[access_m_idx], ref_.data() + ref_.offset(offset));
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace warp
+} // namespace gemm
+} // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/fused_multi_head_attention/kernel_backward.h b/third_party/fused_multi_head_attention/kernel_backward.h
new file mode 100644
index 0000000000..5cdb7c2145
--- /dev/null
+++ b/third_party/fused_multi_head_attention/kernel_backward.h
@@ -0,0 +1,2554 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cmath>
+#include <cinttypes>
+#include <type_traits>
+#include <vector>
+
+#include <cuda_fp16.h>
+#include <curand_kernel.h>
+
+#ifdef HAS_PYTORCH
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
+#endif
+
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/scale_type.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/functional.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+
+#include "debug_utils.h"
+#include "gemm_kernel_utils.h"
+
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/integer_subbyte.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "epilogue/epilogue_pipelined.h"
+#include "iterators/epilogue_predicated_tile_iterator.h"
+
+#include "gemm/custom_mma.h"
+#include "gemm/find_default_mma.h"
+#include "gemm/mma_accum_lambda_iterator.h"
+#include "gemm/mma_from_smem.h"
+#include "transform/tile_smem_loader.h"
+
+using namespace gemm_kernel_utils;
+
+namespace {
+
+template <typename FragmentType, int32_t kNumThreads>
+struct GmemTile {
+  /*
+    Helper functions to efficiently store/load RF to gmem
+
+    GEMM accumulators have a particular format on A100, and
+    it takes some compute/shared-memory to rearrange them to
+    a RowMajor or ColumnMajor format in global memory through
+    an Epilogue. The same complexity goes for loading into RF.
+
+    This class loads/stores RF as they are, and can be used for
+    efficient accumulation across gemms for instance:
+
+    ```
+    GmemTile tile;  // tile.ptr must point to the gmem buffer
+    for (int i = 0; i < N; ++i) {
+      // ...
+
+      Fragment accum;
+      if (i == 0) {
+        accum.clear();
+      } else {
+        tile.load(accum, thread_id);
+      }
+      mma(accum, ...);
+      if (i < N-1) {
+        // Store for next GEMM
+        tile.store(accum, thread_id);
+      } else {
+        // Store in tensor (eg RowMajor)
+        epilogue(accum);
+      }
+
+      // ...
+    }
+    ```
+  */
+
+  // 128 bits (4 floats) per thread per access
+  using AccessType = cutlass::Array<float, 4>;
+  static constexpr int32_t kBytes = sizeof(AccessType);
+  static constexpr int32_t kStride = kNumThreads * AccessType::kElements;
+  static constexpr int32_t kNumIters =
+      FragmentType::kElements / AccessType::kElements;
+  static constexpr int32_t kElementsStored =
+      kNumThreads * FragmentType::kElements;
+  static_assert(
+      FragmentType::kElements % AccessType::kElements == 0,
+      "fragment not aligned on 128 bits");
+
+  float* ptr;  // base pointer of the tile's storage (no default; the caller sets it)
+
+  CUTLASS_DEVICE void load(FragmentType& fragment, int thread_id) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kNumIters; ++i) {  // chunk i of a thread is at ptr + thread_id * 4 + i * kStride
+      AccessType* __restrict__ gmem_ptr = reinterpret_cast<AccessType*>(
+          ptr + thread_id * AccessType::kElements + i * kStride);
+      AccessType sub_fragment;
+      cutlass::arch::global_load<AccessType, kBytes>(
+          sub_fragment, gmem_ptr, true);
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < AccessType::kElements; ++j) {
+        fragment[i * AccessType::kElements + j] = sub_fragment[j];
+      }
+    }
+  }
+
+  CUTLASS_DEVICE void store(FragmentType const& fragment, int thread_id) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kNumIters; ++i) {  // same addressing as load()
+      AccessType* __restrict__ gmem_ptr = reinterpret_cast<AccessType*>(
+          ptr + thread_id * AccessType::kElements + i * kStride);
+      AccessType sub_fragment;
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < AccessType::kElements; ++j) {
+        sub_fragment[j] = fragment[i * AccessType::kElements + j];
+      }
+      cutlass::arch::global_store<AccessType, kBytes>(
+          sub_fragment, gmem_ptr, true);
+    }
+  }
+
+  CUTLASS_DEVICE void storeAtomicAdd(
+      FragmentType const& fragment,
+      int thread_id) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kNumIters; ++i) {  // same addressing as store(), one atomicAdd per float
+      float* gmem_ptr = ptr + thread_id * AccessType::kElements + i * kStride;
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < AccessType::kElements; ++j) {
+        float val = fragment[i * AccessType::kElements + j];
+        float* ptr = gmem_ptr + j;  // NOTE: this local `ptr` shadows the member `ptr`
+        atomicAdd(ptr, val);
+      }
+    }
+  }
+};
+
+struct AtomicLock {  // Spin lock on an int32 word; only the caller with thread_id == 0 touches it
+  CUTLASS_DEVICE static void acquire(
+      int32_t* lock,
+      int set_val,
+      int thread_id) {
+    if (thread_id == 0) {
+      while (atomicCAS(lock, 0 /*cmp*/, set_val /*setval*/) != set_val) {  // returns the old value: exits once the lock already held set_val
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+        __nanosleep(40);
+#endif
+      }
+    }
+    __syncthreads();  // the other threads wait here until thread 0 leaves the spin loop
+  }
+  CUTLASS_DEVICE static void release(int32_t* lock, int thread_id) {
+    if (thread_id == 0) {
+      int status = 0;  // 0 is the "unlocked" value that acquire() compares against
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+      asm volatile("st.global.release.gpu.b32 [%0], %1;\n"
+                   :
+                   : "l"(lock), "r"(status));
+#else
+      asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
+#endif
+    }
+  }
+};
+
+template <typename scalar_t, typename Arch>
+constexpr int getWarpsPerSmBw() {  // warps-per-SM figure; divided by kNumWarpsPerBlock to get kMinBlocksPerSm
+  bool is_half = !cutlass::platform::is_same<scalar_t, float>::value;  // true for any non-float scalar_t
+  if (Arch::kMinComputeCapability >= 80) {
+    return is_half ? 12 : 8;
+  }
+  return 8;
+}
+} // namespace
+
+template <
+    // which arch we target (eg `cutlass::arch::Sm80`)
+    typename ArchTag_,
+    // input/output type
+    typename scalar_t_,
+    // run optimized kernel because memory accesses will be aligned
+    bool kIsAligned_,
+    // use dropout if enabled
+    bool kApplyDropout_,
+    // when doing a GEMM, preload the next one (uses more shmem)
+    bool kPreload_,
+    // block dimensions
+    int kBlockSizeI_,
+    int kBlockSizeJ_,
+    // upperbound on `max(value.shape[-1], query.shape[-1])`
+    int kMaxK_ = (int)cutlass::platform::numeric_limits<uint32_t>::max(),
+    // assumes that `cu_seqlen` is None, and
+    // (1) `num_queries % kBlockSizeI == 0`
+    // (2) `num_keys % kBlockSizeJ == 0`
+    bool kKeysQueriesAlignedToBlockSize_ = false,
+    // Allows to parallelize across keys
+    bool kEnableSplitKeys_ = true>
+struct AttentionBackwardKernel {
+  enum CustomMaskType {
+    NoCustomMask = 0,
+    CausalFromTopLeft = 1,
+    CausalFromBottomRight = 2,
+    NumCustomMaskTypes,
+  };
+  using scalar_t = scalar_t_;
+  using output_t = scalar_t;
+  using output_accum_t = float;
+  using lse_scalar_t = float;
+  using accum_t = float;
+  using ArchTag = ArchTag_;
+  static constexpr bool kIsAligned = kIsAligned_;
+  static constexpr bool kApplyDropout = kApplyDropout_;
+  static constexpr bool kPreload = kPreload_;
+  static constexpr int kBlockSizeI = kBlockSizeI_;
+  static constexpr int kBlockSizeJ = kBlockSizeJ_;
+  static constexpr int kMaxK = kMaxK_;
+  static constexpr bool kKeysQueriesAlignedToBlockSize =
+      kKeysQueriesAlignedToBlockSize_;
+
+  static constexpr int64_t kWarpSize = 32;
+
+  // If this is true, we store and accumulate dK/dV in RF
+  // rather than going back to gmem every time
+  static constexpr bool kIsHalf = cutlass::sizeof_bits<scalar_t>::value <= 16;
+  static constexpr bool kOutputInRF = kIsHalf && kMaxK <= kBlockSizeI;
+  static_assert(
+      !kPreload ||
+          (kIsHalf && ArchTag::kMinComputeCapability >= 80 && kOutputInRF),
+      "preload MMA not supported");
+  static constexpr bool kPrologueQK = kPreload;
+  static constexpr bool kPrologueGV = kPreload;
+  static constexpr bool kPrologueDOV = kPreload;
+  static constexpr bool kPrologueGQ = kPreload;
+  static constexpr bool kPrologueGK = kPreload;
+
+  static constexpr int64_t kNumWarpsPerBlock =
+      (kBlockSizeI * kBlockSizeJ) / (32 * 32);
+
+  // Compute delta for the f16 kernels
+  // TODO: Figure out why it's slower on the f32 kernels
+  // (something due to RF pressure?)
+  // TODO: Remove condition on `kOutputInRF` - this is needed to work
+  // around a compiler bug on V100, not exactly sure why but I spent
+  // too much time on this already. Reproducible with
+  // (B, Mq, Mkv, K) = (1, 1, 1, 136) for instance
+  static constexpr bool kKernelComputesDelta =
+      kIsHalf && (kOutputInRF || ArchTag::kMinComputeCapability != 70);
+
+  // Launch bounds
+  static constexpr int64_t kNumThreads = kWarpSize * kNumWarpsPerBlock;
+  static constexpr int64_t kMinBlocksPerSm =
+      getWarpsPerSmBw<scalar_t, ArchTag>() / kNumWarpsPerBlock;
+
+  using GemmType = DefaultGemmType<ArchTag, scalar_t>;
+  using DefaultConfig =
+      typename cutlass::gemm::device::DefaultGemmConfiguration<
+          typename GemmType::OpClass,
+          ArchTag,
+          scalar_t,
+          scalar_t,
+          scalar_t, // ElementC
+          accum_t // ElementAccumulator
+          >;
+  static constexpr auto kOptimalAlignement = cutlass::platform::max(
+      DefaultConfig::kAlignmentA,
+      DefaultConfig::kAlignmentB);
+  static constexpr auto kMinimumAlignment = GemmType::kMinimumAlignment;
+
+  struct MatmulQK {
+    /*
+    attn_T = k_j @ q_i.transpose(-2, -1) # matmul
+    attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2,
+    -1)).exp() # epilogue
+
+    with attn_T.shape = (kBlockSizeJ, kBlockSizeI)
+    */
+    using ThreadblockShape =
+        cutlass::gemm::GemmShape<kBlockSizeJ, kBlockSizeI, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using DefaultMma = typename cutlass::gemm::threadblock::DefaultMma<
+        scalar_t, // ElementA
+        cutlass::layout::RowMajor, // LayoutA
+        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment,
+        scalar_t, // ElementB
+        cutlass::layout::ColumnMajor, // LayoutB
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment,
+        accum_t, // ElementC
+        cutlass::layout::RowMajor, // LayoutC
+        typename GemmType::OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        DefaultConfig::kStages,
+        typename GemmType::Operator,
+        false, // AccumulatorsInRowMajor = false,
+        cutlass::gemm::SharedMemoryClearOption::kNone>;
+    using MmaCore = typename DefaultMma::MmaCore;
+    using Mma =
+        typename MakeCustomMma<typename DefaultMma::ThreadblockMma, kMaxK>::Mma;
+
+    // used for efficient load of bias tile (Bij) from global memory to shared
+    // memory
+    using BiasLoader = TileSmemLoader<
+        scalar_t,
+        // Bij is applied to transposed attn matrix tile (Pij.T). Bij is loaded
+        // row-major but needs to have transposed shape so we get the same
+        // elements.
+        cutlass::MatrixShape<ThreadblockShape::kN, ThreadblockShape::kM>,
+        MmaCore::kThreads,
+        // input restriction: kv_len has to be a multiple of this value
+        128 / cutlass::sizeof_bits<scalar_t>::value>;
+
+    // Epilogue to store to shared-memory in a format that we can use later for
+    // the second matmul
+    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
+        typename Mma::Operator::IteratorC,
+        typename Mma::Operator,
+        scalar_t,
+        WarpShape,
+        ThreadblockShape>;
+    using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator<
+        typename Mma::Operator::IteratorC,
+        accum_t,
+        kWarpSize>::Iterator;
+    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage;
+  };
+
+  struct MatmulGradV {
+    /*
+    grad_v[j_start:j_end] += attn_T @ do_i # matmul
+
+    Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K)
+    (we might need to iterate multiple times on K)
+    */
+    using ThreadblockShape =
+        cutlass::gemm::GemmShape<kBlockSizeJ, kBlockSizeI, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        DefaultConfig::kAlignmentA,
+        scalar_t, // ElementB,
+        cutlass::layout::RowMajor, // LayoutB,
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment,
+        output_t,
+        cutlass::layout::RowMajor, // LayoutC,
+        accum_t,
+        typename GemmType::OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        typename DefaultConfig::EpilogueOutputOp,
+        void, // ThreadblockSwizzle - not used
+        DefaultConfig::kStages,
+        false, // SplitKSerial
+        typename GemmType::Operator>;
+
+    // if dropout:
+    //   for computing dVj += (Pij.T * Zij) @ dOi
+    //   Pij_dropped.T = Pij.T * Zij is computed on the fly as fragments of
+    //   Pij.T are loaded in. The reason we do it this way is because Pij.T and
+    //   Zij are reused in later steps, while Pij_dropped.T is only needed in
+    //   this step. computing Pij_dropped.T on the fly allows us to avoid
+    //   keeping all 3 of Pij_dropped.T, Pij.T, and Zij in shared memory at the
+    //   same time.
+    // if no dropout:
+    //   for computing dVj += Pij.T @ dOi
+    using WarpIteratorA = typename cutlass::gemm::threadblock::
+        DefaultWarpIteratorAFromSharedMemory<
+            typename DefaultGemm::Mma::Operator::Shape, // WarpShape
+            typename DefaultGemm::Mma::Operator::
+                InstructionShape, // InstructionShape
+            typename DefaultGemm::Mma::Operator::
+                IteratorA, // RegularWarpIterator
+            typename DefaultGemm::Mma::Policy // Policy
+            >::WarpIterator;
+    using DefaultMmaFromSmem =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            MatmulQK::AccumulatorSharedStorage::Shape::kN,
+            WarpIteratorA,
+            kApplyDropout>; // kScaleOperandA
+
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+
+    // Epilogue
+    using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp;
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+    using OutputTileIterator =
+        typename cutlass::epilogue::threadblock::MakePrefetchableIterator<
+            typename DefaultEpilogue::OutputTileIterator>::Iterator;
+    using AccumTileGmem = GmemTile<typename Mma::FragmentC, (int)kNumThreads>;
+  };
+
+  struct MatmulDOIVJ {
+    /* Type aliases for the dO_i @ V_j^T matmul. Pseudo-code of the step:
+    doi_t_vj = do_i @ v_j.transpose(-2, -1) # matmul
+    tmp = (doi_t_vj - Di.unsqueeze(1)) * attn # inplace / epilogue?
+    */
+    using ThreadblockShape =
+        cutlass::gemm::GemmShape<kBlockSizeI, kBlockSizeJ, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+
+    using ElementC = output_t;
+    using ElementAccum = accum_t;
+
+    // no-op output op (ScaleType::Nothing) - epilogue just stores to global memory
+    using BiasGradEpilogueOutputOp =
+        typename cutlass::epilogue::thread::LinearCombination<
+            ElementC,
+            DefaultConfig::EpilogueOutputOp::kCount,
+            typename DefaultConfig::EpilogueOutputOp::ElementAccumulator,
+            typename DefaultConfig::EpilogueOutputOp::ElementCompute,
+            cutlass::epilogue::thread::ScaleType::Nothing>;
+
+    using DefaultGemm = typename cutlass::gemm::kernel::DefaultGemm<
+        scalar_t, // ElementA
+        cutlass::layout::RowMajor, // LayoutA
+        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment,
+        scalar_t, // ElementB
+        cutlass::layout::ColumnMajor, // LayoutB
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment,
+        ElementC, // ElementC
+        cutlass::layout::RowMajor, // LayoutC
+        ElementAccum, // ElementAccumulator
+        typename GemmType::OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        BiasGradEpilogueOutputOp, // EpilogueOutputOp
+        void, // ThreadblockSwizzle (not used)
+        // multiple preloads, dropout Zij tile, and 3 stages push us over shared
+        // memory capacity on A100. set a ceiling on number of stages to save
+        // shared memory if dropout is in use.
+        kPreload && kApplyDropout && (kBlockSizeI * kBlockSizeJ > 64 * 64)
+            ? cutlass::const_min(2, DefaultConfig::kStages)
+            : DefaultConfig::kStages, // Stages
+        false, // SplitKSerial
+        typename GemmType::Operator,
+        cutlass::gemm::SharedMemoryClearOption::kNone>;
+    using Mma = typename MakeCustomMma<typename DefaultGemm::Mma, kMaxK>::Mma;
+    using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator<
+        typename Mma::Operator::IteratorC,
+        ElementAccum,
+        kWarpSize>::Iterator;
+
+    // epilogue used to write bias gradient, which is just the output of this
+    // matmul with some operations applied to the fragment
+    using BiasGradEpilogue = typename DefaultGemm::Epilogue;
+
+    // Epilogue to store to shared-memory in a format that we can use later for
+    // the second matmul (MatmulGradQ / MatmulGradK read this storage's Shape)
+    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
+        typename DefaultGemm::Mma::Operator::IteratorC,
+        typename DefaultGemm::Mma::Operator,
+        scalar_t,
+        WarpShape,
+        ThreadblockShape>;
+    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage;
+  };
+
+  struct MatmulGradQ {
+    // Type aliases for grad_q <- tmp @ k_j (A goes through *FromSharedMemory)
+    using ThreadblockShape =
+        cutlass::gemm::GemmShape<kBlockSizeI, kBlockSizeJ, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        DefaultConfig::kAlignmentA, // AlignmentA (no kIsAligned fallback, unlike B)
+        scalar_t, // ElementB,
+        cutlass::layout::RowMajor, // LayoutB,
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment,
+        output_t, // ElementC
+        cutlass::layout::RowMajor, // LayoutC,
+        accum_t, // ElementAccumulator
+        typename GemmType::OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        typename DefaultConfig::EpilogueOutputOp,
+        void, // ThreadblockSwizzle - not used
+        DefaultConfig::kStages,
+        false, // SplitKSerial
+        typename GemmType::Operator>;
+
+    using WarpIteratorA = typename cutlass::gemm::threadblock::
+        DefaultWarpIteratorAFromSharedMemory<
+            typename DefaultGemm::Mma::Operator::Shape,
+            typename DefaultGemm::Mma::Operator::InstructionShape,
+            typename DefaultGemm::Mma::Operator::IteratorA,
+            typename DefaultGemm::Mma::Policy>::WarpIterator;
+    using DefaultMmaFromSmem =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            MatmulDOIVJ::AccumulatorSharedStorage::Shape::kN, // kMaxK
+            WarpIteratorA,
+            false>; // kScaleOperandA
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+
+    // Epilogue (types reused from the plain DefaultGemm above)
+    using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp;
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+    using OutputTileIterator =
+        typename cutlass::epilogue::threadblock::MakePrefetchableIterator<
+            typename DefaultEpilogue::OutputTileIterator>::Iterator;
+    using AccumTileGmem = GmemTile<typename Mma::FragmentC, (int)kNumThreads>;
+  };
+  struct MatmulGradK {
+    // grad_k <- tmp.transpose(-2, -1) @ q_i (tile is J x I: swapped vs GradQ)
+    using ThreadblockShape =
+        cutlass::gemm::GemmShape<kBlockSizeJ, kBlockSizeI, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        DefaultConfig::kAlignmentA, // AlignmentA (no kIsAligned fallback, unlike B)
+        scalar_t, // ElementB,
+        cutlass::layout::RowMajor, // LayoutB,
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment,
+        output_t, // ElementC
+        cutlass::layout::RowMajor, // LayoutC,
+        accum_t, // ElementAccumulator
+        typename GemmType::OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        typename DefaultConfig::EpilogueOutputOp,
+        void, // ThreadblockSwizzle - not used
+        DefaultConfig::kStages,
+        false, // SplitKSerial
+        typename GemmType::Operator>;
+
+    using WarpIteratorA = typename cutlass::gemm::threadblock::
+        DefaultWarpIteratorAFromSharedMemory<
+            typename DefaultGemm::Mma::Operator::Shape,
+            typename DefaultGemm::Mma::Operator::InstructionShape,
+            typename DefaultGemm::Mma::Operator::IteratorA,
+            typename DefaultGemm::Mma::Policy>::WarpIterator;
+    using DefaultMmaFromSmemN =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            MatmulQK::AccumulatorSharedStorage::Shape::kN, // kMaxK
+            WarpIteratorA,
+            false>; // kScaleOperandA
+    using DefaultMmaFromSmemT =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            MatmulDOIVJ::AccumulatorSharedStorage::Shape::kM, // kMaxK
+            WarpIteratorA,
+            false, // kScaleOperandA
+            kPreload>; // kTransposeA
+    using DefaultMmaFromSmem = typename cutlass::platform::conditional<
+        DefaultMmaFromSmemT::kIsTransposedA, // pick T only if it reports this flag
+        DefaultMmaFromSmemT,
+        DefaultMmaFromSmemN>::type;
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+
+    // Epilogue (types reused from the plain DefaultGemm above)
+    using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp;
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+    using OutputTileIterator =
+        typename cutlass::epilogue::threadblock::MakePrefetchableIterator<
+            typename DefaultEpilogue::OutputTileIterator>::Iterator;
+    using AccumTileGmem = GmemTile<typename Mma::FragmentC, (int)kNumThreads>;
+  };
+
+  static constexpr bool kEnableSplitKeys = kEnableSplitKeys_;
+  // Flags: does this gradient need room in the `Params::workspace` buffer?
+  static constexpr bool kNeedsAccumGradQ = kEnableSplitKeys ? true
+      : !cutlass::platform::is_same<output_accum_t, output_t>::value;
+  static constexpr bool kNeedsAccumGradK = kOutputInRF ? false
+      : !cutlass::platform::is_same<output_accum_t, output_t>::value;
+  static constexpr bool kNeedsAccumGradV = kOutputInRF ? false
+      : !cutlass::platform::is_same<output_accum_t, output_t>::value;
+
+  struct GradQTempStorage { // one dQ slot; Params::workspace_gq points at these
+    int32_t lock; // NOTE(review): presumably guards `buffer`; usage not visible here
+    int32_t counter; // NOTE(review): meaning not visible in this chunk
+    int32_t pad[2]; // pad to 128bits (lock + counter + pad = 4 x int32)
+    output_accum_t buffer[MatmulGradQ::AccumTileGmem::kElementsStored];
+  };
+
+  struct Params {
+    // Input tensors
+    scalar_t* query_ptr = nullptr; // [Mq, nH, K]
+    scalar_t* key_ptr = nullptr; // [Mk, nH, K]
+    scalar_t* value_ptr = nullptr; // [Mk, nH, Kv]
+    scalar_t* bias_ptr = nullptr;
+    lse_scalar_t* logsumexp_ptr = nullptr; // [nH, Mq]
+    scalar_t* output_ptr = nullptr; // [Mq, nH, Kv]
+    scalar_t* grad_output_ptr = nullptr; // [Mq, nH, Kv]
+    accum_t* delta_ptr = nullptr; // [nH, Mq]
+    int32_t* cu_seqlens_q_ptr = nullptr;
+    int32_t* cu_seqlens_k_ptr = nullptr;
+
+    // Output tensors
+    output_t* grad_query_ptr = nullptr; //  [Mq, nH, K]
+    output_t* grad_key_ptr = nullptr; //    [Mk, nH, K]
+    output_t* grad_value_ptr = nullptr; //  [Mk, nH, Kv]
+    output_t* grad_bias_ptr = nullptr;
+
+    // Accumulators
+    output_accum_t* workspace = nullptr; // [Mq, Kq] + [Mkv, Kq] + [Mkv, Kv]
+    output_accum_t* workspace_gv =
+        nullptr; // (will be calculated by the kernel)
+    GradQTempStorage* workspace_gq =
+        nullptr; // (will be calculated by the kernel)
+
+    // Scale
+    accum_t scale = 1.0f;
+
+    // Dimensions/strides
+    int32_t head_dim = -1;
+    int32_t head_dim_value = -1;
+    int32_t num_queries = -1;
+    int32_t num_keys = -1;
+    int32_t num_heads = -1;
+    uint8_t custom_mask_type = NoCustomMask;
+
+    int32_t q_strideM = -1;
+    int32_t k_strideM = -1;
+    int32_t v_strideM = -1;
+    int32_t bias_strideM = 0;
+    int32_t gO_strideM = -1;
+    int32_t gB_strideM = -1;
+    int8_t gQKV_strideM_multiplier = 1; // 3 for packed, 1 otherwise
+
+#ifdef HAS_PYTORCH
+    // dropout
+    at::PhiloxCudaState rng_engine_inputs = {0, 0};
+#endif
+    // RNG sequence offset based on batch_id and head_id
+    unsigned long long dropout_batch_head_rng_offset = 0;
+    float dropout_prob = 0.0f;
+
+    CUTLASS_HOST_DEVICE int32_t o_strideM() const {
+      return num_heads * head_dim_value; // derived, not stored in Params
+    }
+    CUTLASS_HOST_DEVICE int32_t gQ_strideM() const {
+      return (gQKV_strideM_multiplier * num_heads) * head_dim; // multiplier: 3 if packed
+    }
+    CUTLASS_HOST_DEVICE int32_t gK_strideM() const {
+      return (gQKV_strideM_multiplier * num_heads) * head_dim; // same as gQ_strideM
+    }
+    CUTLASS_HOST_DEVICE int32_t gV_strideM() const {
+      return (gQKV_strideM_multiplier * num_heads) * head_dim_value;
+    }
+
+    // Everything below is only used in `advance_to_block`
+    // and shouldn't use registers
+    int64_t o_strideH = -1;
+    int32_t q_strideH = -1;
+    int32_t k_strideH = -1;
+    int32_t v_strideH = -1;
+    int64_t bias_strideH = 0;
+    int64_t o_strideB = -1;
+    int64_t q_strideB = -1;
+    int64_t k_strideB = -1;
+    int64_t v_strideB = -1;
+    int64_t bias_strideB = 0;
+    int64_t lse_strideB = -1;
+    int64_t lse_strideH = -1;
+    int64_t delta_strideB = -1;
+    int64_t delta_strideH = -1;
+    int32_t num_batches = -1;
+    int16_t num_splits_key = 1; // We use `gridDim.x` inside kernel
+
+    int64_t gO_strideB = 0;
+    int64_t gQ_strideB = 0;
+    int64_t gK_strideB = 0;
+    int64_t gV_strideB = 0;
+    int64_t gB_strideB = 0;
+    int64_t gO_strideH = 0;
+    int64_t gQ_strideH = 0;
+    int64_t gK_strideH = 0;
+    int64_t gV_strideH = 0;
+    int64_t gB_strideH = 0;
+
+    CUTLASS_DEVICE int16_t num_splits_key_device() const {
+      return !kEnableSplitKeys ? 1 : gridDim.x; // split count is read from the grid
+    }
+    CUTLASS_DEVICE int16_t split_key_device() const {
+      return !kEnableSplitKeys ? 0 : blockIdx.x; // this block's split index
+    }
+
+    CUTLASS_DEVICE bool advance_to_block() { // point all tensors at this block's (batch, head)
+      int64_t batch_id = blockIdx.z;
+      int32_t head_id = blockIdx.y;
+      // Carve this (batch, head)'s workspace slice into [gk | gv | gq] regions
+      if (kNeedsAccumGradQ || kNeedsAccumGradK || kNeedsAccumGradV) {
+        assert(workspace_size() == 0 || workspace != nullptr);
+
+        workspace += (batch_id * num_heads + head_id) * workspace_strideBH();
+        workspace = warp_uniform(workspace);
+        workspace_gv = workspace + workspace_elements_gk();
+        workspace_gq =
+            (GradQTempStorage*)(workspace_gv + workspace_elements_gv());
+        if (kEnableSplitKeys) {
+          workspace_gv += workspace_elements_gv() * split_key_device() /
+              num_splits_key_device();
+          workspace += workspace_elements_gk() * split_key_device() /
+              num_splits_key_device();
+        }
+      } else {
+        workspace = nullptr;
+      }
+
+      // Advance pointers that depend on the total concatenated
+      // number of queries, as `num_queries` is modified in the block
+      // below
+      dropout_batch_head_rng_offset =
+          batch_id * (num_heads * num_queries * num_keys) + // NOTE(review): int32 products,
+          head_id * (num_queries * num_keys); // left unchanged to keep RNG offsets stable
+      logsumexp_ptr += batch_id * lse_strideB + head_id * lse_strideH;
+
+      if (cu_seqlens_q_ptr != nullptr) {
+        assert(cu_seqlens_k_ptr != nullptr);
+        cu_seqlens_q_ptr += batch_id;
+        cu_seqlens_k_ptr += batch_id;
+        int64_t q_start = cu_seqlens_q_ptr[0]; // 64-bit so `start * stride` below
+        int64_t k_start = cu_seqlens_k_ptr[0]; // cannot overflow int32
+        int64_t q_next_start = cu_seqlens_q_ptr[1];
+        int64_t k_next_start = cu_seqlens_k_ptr[1];
+        assert(q_next_start - q_start <= num_queries);
+        assert(k_next_start - k_start <= num_keys);
+        num_queries = q_next_start - q_start;
+        num_keys = k_next_start - k_start;
+
+        // Jump manually
+        batch_id = 0;
+
+        query_ptr += q_start * q_strideM;
+        key_ptr += k_start * k_strideM;
+        value_ptr += k_start * v_strideM;
+        assert(bias_ptr == nullptr);
+        assert(grad_bias_ptr == nullptr);
+        output_ptr += q_start * o_strideM();
+        grad_output_ptr += q_start * gO_strideM;
+        delta_ptr += q_start;
+
+        grad_query_ptr += q_start * gQ_strideM();
+        grad_key_ptr += k_start * gK_strideM();
+        grad_value_ptr += k_start * gV_strideM();
+      }
+      // Batch/head offsets (batch_id was reset to 0 in the cu_seqlens path)
+      query_ptr += batch_id * q_strideB + head_id * q_strideH;
+      key_ptr += batch_id * k_strideB + head_id * k_strideH;
+      value_ptr += batch_id * v_strideB + head_id * v_strideH;
+      if (bias_ptr != nullptr) {
+        bias_ptr += batch_id * bias_strideB + head_id * bias_strideH;
+      }
+      output_ptr += batch_id * o_strideB + head_id * o_strideH;
+      grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH;
+      delta_ptr += batch_id * delta_strideB + head_id * delta_strideH;
+
+      grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH;
+      grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH;
+      grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH;
+      if (grad_bias_ptr != nullptr) {
+        grad_bias_ptr += batch_id * gB_strideB + head_id * gB_strideH;
+      }
+
+      // Some values are modified above
+      // Signal to the compiler that they are the same in all threads
+      // and can be stored in warp-uniform registers (Sm75+)
+      num_queries = warp_uniform(num_queries);
+      num_keys = warp_uniform(num_keys);
+      custom_mask_type = warp_uniform(custom_mask_type);
+
+      query_ptr = warp_uniform(query_ptr);
+      key_ptr = warp_uniform(key_ptr);
+      value_ptr = warp_uniform(value_ptr);
+      bias_ptr = warp_uniform(bias_ptr);
+      logsumexp_ptr = warp_uniform(logsumexp_ptr);
+      output_ptr = warp_uniform(output_ptr);
+      grad_output_ptr = warp_uniform(grad_output_ptr);
+      delta_ptr = warp_uniform(delta_ptr);
+
+      grad_query_ptr = warp_uniform(grad_query_ptr);
+      grad_key_ptr = warp_uniform(grad_key_ptr);
+      grad_value_ptr = warp_uniform(grad_value_ptr);
+      grad_bias_ptr = warp_uniform(grad_bias_ptr);
+
+#if 0
+      PRINT_T0("[b:%d h:%d] dp[0]:%f Q:%f K:%f V:%f LSE:%f",
+        int(blockIdx.z), int(blockIdx.y),
+        float(delta_ptr[0]),
+        float(query_ptr[0]), float(key_ptr[0]), float(value_ptr[0]),
+        float(logsumexp_ptr[0])
+      )
+#endif
+      return true; // unconditional: no early-out path in this function
+    }
+
+    __host__ dim3 getBlocksGrid() const { // x: key split, y: head, z: batch
+      return dim3(num_splits_key, num_heads, num_batches);
+    }
+    __host__ dim3 getThreadsGrid() const { // 1-D thread block
+      return dim3(kNumWarpsPerBlock * kWarpSize, 1, 1);
+    }
+    CUTLASS_HOST_DEVICE int64_t workspace_elements_gk() const { // dK region, in elements
+      if (!kNeedsAccumGradK) {
+        return 0;
+      }
+      return int64_t(num_splits_key) * align_up(num_keys, (int32_t)kBlockSizeJ) *
+          align_up(head_dim, (int32_t)kBlockSizeI); // widened first: no int32 overflow
+    }
+    CUTLASS_HOST_DEVICE int64_t workspace_elements_gv() const { // dV region, in elements
+      if (!kNeedsAccumGradV) {
+        return 0;
+      }
+      return int64_t(num_splits_key) * align_up(num_keys, (int32_t)kBlockSizeJ) *
+          align_up(head_dim_value, (int32_t)kBlockSizeI); // widened first, as for gk
+    }
+    CUTLASS_HOST_DEVICE int64_t workspace_elements_gq() const { // dQ region, in elements
+      if (!kNeedsAccumGradQ) {
+        return 0;
+      }
+      int row_tiles = ceil_div(num_queries, kBlockSizeI);
+      int col_tiles = ceil_div(head_dim, MatmulGradQ::ThreadblockShape::kN);
+      return row_tiles * col_tiles * sizeof(GradQTempStorage) /
+          sizeof(output_accum_t); // bytes of all slots, expressed in elements
+    }
+    CUTLASS_HOST_DEVICE int64_t workspace_strideBH() const {
+      // Per-(batch, head) stride in elements, rounded up to a multiple of 4
+      // elements (128 bits when output_accum_t is 4 bytes wide)
+      int64_t elements = workspace_elements_gk() + workspace_elements_gv() +
+          workspace_elements_gq();
+      return align_up(elements, int64_t(4));
+    }
+    CUTLASS_HOST_DEVICE int64_t workspace_size() const {
+      // Returns size in bytes of the buffer we need to run this kernel
+      return int64_t(num_batches) * num_heads * workspace_strideBH() * sizeof(output_accum_t);
+    }
+    CUTLASS_HOST_DEVICE bool should_zero_workspace() const {
+      return num_splits_key > 1; // true only when keys are split across blocks
+    }
+  };
+
+  // shared storage for keeping Zij matrix. not needed if we aren't using
+  // dropout (kApplyDropout false), in which case we use an empty array instead
+  using ZijSharedStorage = typename cutlass::platform::conditional<
+      kApplyDropout,
+      typename MatmulQK::AccumulatorSharedStorage,
+      // dummy shared storage object that takes up no space (1x1 on Windows).
+      typename cutlass::gemm::threadblock::AccumulatorSharedStorage<
+#ifdef _WIN32
+          // windows builds throw the error:
+          // "type containing an unknown-size array is not allowed"
+          // if we try to make Zij shared storage zero-sized.
+          // To get around this just make it sized 1 on windows.
+          typename cutlass::gemm::GemmShape<1, 1, 0>,
+#else
+          typename cutlass::gemm::GemmShape<0, 0, 0>,
+#endif
+          typename MatmulQK::AccumulatorSharedStorage::Element,
+          typename MatmulQK::AccumulatorSharedStorage::Layout,
+          typename cutlass::MatrixShape<0, 0>>>::type;
+
+  struct SharedStoragePrologue { // selected by `SharedStorage` when kPreload is set
+    struct {
+      cutlass::Array<accum_t, kBlockSizeI> di; // (do_i * o_i).sum(-1)
+      typename MatmulQK::Mma::SharedStorageA mm_qk_k;
+    } persistent;
+    union { // part1..part4 overlay the same bytes; `persistent` is outside it
+      struct {
+        // part1 - after Q.K / dV / dO.V
+        union {
+          // 1. efficient load of bias tile Bij, which is then applied to Pij
+          typename MatmulQK::BiasLoader::SmemTile bias;
+          // 4. store Pij. it is needed:
+          // - in dVj += (Pij.T * Zij) @ dOi
+          // - in dSij = Pij * (dPij - Di)
+          // 6. dVj += (Pij.T * Zij) @ dOi
+          // 10. write to fragment
+          typename MatmulQK::AccumulatorSharedStorage attn_shared_storage;
+        };
+        // 5. store Zij. it is needed in dVj += (Pij.T * Zij) @ dOi
+        ZijSharedStorage zij;
+
+        union {
+          // 2. prologue for dVj
+          // 6. workspace for dVj += (Pij.T * Zij) @ dOi
+          typename MatmulGradV::Mma::SharedStorage mm_gradV;
+          // 7. dVj epilogue
+          typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue;
+        };
+
+        // 3. prologue for dPij_dropped
+        // 8. used in dPij_dropped = dOi @ Vj.T
+        typename MatmulDOIVJ::Mma::SharedStorage mm_doivj;
+      } part1;
+
+      struct {
+        // part2 - dQ
+        union {
+          typename MatmulQK::AccumulatorSharedStorage
+              tmpT_shared_storage; // (from part1)
+          typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage;
+        };
+        typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload)
+        typename MatmulGradQ::Mma::SharedStorage mm_gradQ; // (preload)
+        union {
+          // store dB = dSij to global memory
+          typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue;
+          typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue;
+        };
+
+      } part2;
+
+      struct {
+        // part3 - after last iteration on dQ's epilogue / dK
+        union {
+          typename MatmulQK::AccumulatorSharedStorage
+              tmpT_shared_storage; // (from part1)
+          typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage;
+        };
+        typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload)
+        typename MatmulGradQ::DefaultEpilogue::SharedStorage
+            gradQ_epilogue_lastIter;
+
+        typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue;
+      } part3;
+
+      struct {
+        // part4 - after last iteration on dK's epilogue / preload next K.Q_t
+        typename MatmulQK::Mma::SharedStorageB mm_qk_q;
+
+        // If we reach end of current key, dump RF->gmem with "final" epilogues
+        typename MatmulGradK::DefaultEpilogue::SharedStorage
+            gradK_epilogue_final;
+        typename MatmulGradV::DefaultEpilogue::SharedStorage
+            gradV_epilogue_final;
+      } part4;
+    };
+    static void print_size() { // debug helper: printf sizeof of each region/member
+      // Field size in bytes (NOTE: FSZ is never #undef'd in the visible code)
+#define FSZ(f) int((sizeof(((SharedStoragePrologue*)0)->f)))
+
+      printf("Total smem: %d bytes\n", int(sizeof(SharedStoragePrologue)));
+      printf("  persistent: %db\n", FSZ(persistent));
+      printf("    mm_qk_k: %db\n", FSZ(persistent.mm_qk_k));
+      printf("  part1: %db\n", FSZ(part1));
+      printf("    bias: %db\n", FSZ(part1.bias));
+      printf("    attn_shared_storage: %db\n", FSZ(part1.attn_shared_storage));
+      printf("    zij: %db\n", FSZ(part1.zij));
+      printf("    mm_gradV: %db\n", FSZ(part1.mm_gradV));
+      printf("    gradV_epilogue: %db\n", FSZ(part1.gradV_epilogue));
+      printf("    mm_doivj: %db\n", FSZ(part1.mm_doivj));
+      printf("  part2: %db\n", FSZ(part2));
+      printf("    tmpT_shared_storage: %db\n", FSZ(part2.tmpT_shared_storage));
+      printf("    tmp_shared_storage: %db\n", FSZ(part2.tmp_shared_storage));
+      printf("    mm_gradK: %db\n", FSZ(part2.mm_gradK));
+      printf("    mm_gradQ: %db\n", FSZ(part2.mm_gradQ));
+      printf("    gradB_epilogue: %db\n", FSZ(part2.gradB_epilogue));
+      printf("    gradQ_epilogue: %db\n", FSZ(part2.gradQ_epilogue));
+      printf("  part3: %db\n", FSZ(part3));
+      printf("    tmpT_shared_storage: %db\n", FSZ(part3.tmpT_shared_storage));
+      printf("  part4: %db\n", FSZ(part4));
+      printf("    mm_qk_q: %db\n", FSZ(part4.mm_qk_q));
+      printf(
+          "    gradK_epilogue_final: %db\n", FSZ(part4.gradK_epilogue_final));
+      printf(
+          "    gradV_epilogue_final: %db\n", FSZ(part4.gradV_epilogue_final));
+    }
+// == FIELD(part, name): defines device accessor `name()` returning part.name ==
+#define FIELD(INSIDE_STRUCT, FIELDNAME) \
+  CUTLASS_DEVICE auto& FIELDNAME() {    \
+    return INSIDE_STRUCT.FIELDNAME;     \
+  }
+
+    FIELD(persistent, di)
+    FIELD(persistent, mm_qk_k)
+    FIELD(part1, bias)
+    FIELD(part1, attn_shared_storage)
+    FIELD(part1, zij)
+    FIELD(part1, mm_gradV)
+    FIELD(part1, gradV_epilogue)
+    FIELD(part1, mm_doivj)
+    FIELD(part2, mm_gradK)
+    FIELD(part2, mm_gradQ)
+    FIELD(part2, gradB_epilogue)
+    FIELD(part2, gradQ_epilogue)
+    FIELD(part2, tmp_shared_storage)
+    FIELD(part3, tmpT_shared_storage)
+    FIELD(part3, gradQ_epilogue_lastIter)
+    FIELD(part3, gradK_epilogue)
+    FIELD(part4, mm_qk_q)
+    FIELD(part4, gradK_epilogue_final)
+    FIELD(part4, gradV_epilogue_final)
+  };
+
+  struct SharedStorageNoPrologue { // selected by `SharedStorage` when kPreload is off
+    struct {
+      cutlass::Array<accum_t, kBlockSizeI> di; // (do_i * o_i).sum(-1)
+    } persistent;
+    union { // part1..part6 overlay the same bytes; `persistent` is outside it
+      struct {
+        // part1 - Q.K matmul
+        typename MatmulQK::Mma::SharedStorageA mm_qk_k;
+        typename MatmulQK::Mma::SharedStorageB mm_qk_q;
+      } part1;
+
+      struct {
+        // part2 - compute gradV
+        union {
+          // 1. efficient load of bias tile Bij, which is then applied to Pij
+          typename MatmulQK::BiasLoader::SmemTile bias;
+          // 2. store Pij to shared memory. it is needed:
+          // - in this step, where it is used in dVj += (Pij.T * Zij) @ dOi
+          // - in next step where it is used in dSij = Pij * (dPij - Di)
+          typename MatmulQK::AccumulatorSharedStorage attn_shared_storage;
+        };
+        // 3. store Zij. it is needed in this step, where it is used
+        // to compute Pij_dropped = Pij * Zij on the fly as fragments of Pij are
+        // loaded for the computation of dVj.
+        ZijSharedStorage zij;
+
+        union {
+          typename MatmulGradV::Mma::SharedStorage mm_gradV;
+          typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue;
+        };
+      } part2;
+
+      struct {
+        // part3 - DO.V matmul
+        union {
+          // first compute dPij = (dOi @ Vj.T) * Zij
+          // and dSij = Pij * (dPij - Di)
+          struct {
+            // (from part2) - Pij for computing dSij = Pij * (dPij - Di)
+            typename MatmulQK::AccumulatorSharedStorage attn_shared_storage;
+            // matmul to compute dOiVj
+            typename MatmulDOIVJ::Mma::SharedStorage mm_doivj;
+          };
+          // then store dB = dSij to global memory
+          typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue;
+        };
+      } part3;
+
+      struct {
+        // part4 - compute gradQ
+        typename MatmulQK::AccumulatorSharedStorage
+            tmpT_shared_storage; // (from part2)
+        typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage;
+        union {
+          typename MatmulGradQ::Mma::SharedStorage mm_gradQ;
+          typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue;
+          typename MatmulGradQ::DefaultEpilogue::SharedStorage
+              gradQ_epilogue_lastIter;
+        };
+      } part4;
+
+      struct {
+        // part5 - compute gradK
+        typename MatmulQK::AccumulatorSharedStorage
+            tmpT_shared_storage; // (from part2)
+        typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage;
+        union {
+          typename MatmulGradK::Mma::SharedStorage mm_gradK;
+          typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue;
+        };
+      } part5;
+
+      struct {
+        // part6 - store RF accumulated into gmem
+        typename MatmulGradK::DefaultEpilogue::SharedStorage
+            gradK_epilogue_final;
+        typename MatmulGradV::DefaultEpilogue::SharedStorage
+            gradV_epilogue_final;
+      } part6;
+    };
+    static void print_size() { // debug helper: printf sizeof of each region
+#define FIELD_SIZEOF(f) int((sizeof(((SharedStorageNoPrologue*)0)->f)))
+      printf("Total smem: %d bytes\n", int(sizeof(SharedStorageNoPrologue)));
+      printf("  persistent: %db\n", FIELD_SIZEOF(persistent));
+      printf("  part1: %db\n", FIELD_SIZEOF(part1));
+      printf("  part2: %db\n", FIELD_SIZEOF(part2));
+      printf("  part3: %db\n", FIELD_SIZEOF(part3));
+      printf("  part4: %db\n", FIELD_SIZEOF(part4));
+      printf("  part5: %db\n", FIELD_SIZEOF(part5));
+      printf("  part6: %db\n", FIELD_SIZEOF(part6));
+    }
+// == FIELD(part, name): defines device accessor `name()` returning part.name ==
+#define FIELD(INSIDE_STRUCT, FIELDNAME) \
+  CUTLASS_DEVICE auto& FIELDNAME() {    \
+    return INSIDE_STRUCT.FIELDNAME;     \
+  }
+
+    FIELD(persistent, di)
+    FIELD(part1, mm_qk_k)
+    FIELD(part1, mm_qk_q)
+    FIELD(part2, bias)
+    FIELD(part2, attn_shared_storage)
+    FIELD(part2, zij)
+    FIELD(part2, mm_gradV)
+    FIELD(part2, gradV_epilogue)
+    FIELD(part3, mm_doivj)
+    FIELD(part3, gradB_epilogue)
+    FIELD(part4, tmpT_shared_storage)
+    FIELD(part4, tmp_shared_storage)
+    FIELD(part4, mm_gradQ)
+    FIELD(part4, gradQ_epilogue)
+    FIELD(part4, gradQ_epilogue_lastIter)
+    FIELD(part5, mm_gradK)
+    FIELD(part5, gradK_epilogue)
+    FIELD(part6, gradK_epilogue_final)
+    FIELD(part6, gradV_epilogue_final)
+  };
+
+  using SharedStorage = typename cutlass::platform::conditional<
+      !kPreload, // layout is chosen at compile time from the preload flag
+      SharedStorageNoPrologue,
+      SharedStoragePrologue>::type;
+
+  struct OutputFragments { // accumulator fragments for the dV and dK matmuls
+    typename MatmulGradV::Mma::FragmentC gradV;
+    typename MatmulGradK::Mma::FragmentC gradK;
+    // Resets both fragments via their own clear()
+    CUTLASS_DEVICE void clear() {
+      gradK.clear();
+      gradV.clear();
+    }
+  };
+
+  static bool __host__ check_supported(Params const& p) { // host-side validation of `p`
+    CHECK_ALIGNED_PTR(p.query_ptr, kMinimumAlignment);
+    CHECK_ALIGNED_PTR(p.key_ptr, kMinimumAlignment);
+    CHECK_ALIGNED_PTR(p.value_ptr, kMinimumAlignment);
+    CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment);
+    CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment);
+    CHECK_ALIGNED_PTR(p.bias_ptr, kMinimumAlignment);
+    XFORMERS_CHECK(p.lse_strideH % 8 == 0, "LSE is not correctly aligned");
+    XFORMERS_CHECK(p.lse_strideB % 8 == 0, "LSE is not correctly aligned");
+    XFORMERS_CHECK(
+        p.num_heads <= 1 || p.q_strideH % kMinimumAlignment == 0,
+        "query is not correctly aligned (strideH)");
+    XFORMERS_CHECK(
+        p.num_heads <= 1 || p.k_strideH % kMinimumAlignment == 0,
+        "key is not correctly aligned (strideH)");
+    XFORMERS_CHECK(
+        p.num_heads <= 1 || p.v_strideH % kMinimumAlignment == 0,
+        "value is not correctly aligned (strideH)");
+    XFORMERS_CHECK(
+        p.num_batches <= 1 || p.q_strideB % kMinimumAlignment == 0,
+        "query is not correctly aligned (strideB)");
+    XFORMERS_CHECK(
+        p.num_batches <= 1 || p.k_strideB % kMinimumAlignment == 0,
+        "key is not correctly aligned (strideB)");
+    XFORMERS_CHECK(
+        p.num_batches <= 1 || p.v_strideB % kMinimumAlignment == 0,
+        "value is not correctly aligned (strideB)");
+    XFORMERS_CHECK(
+        p.q_strideM % kMinimumAlignment == 0,
+        "query is not correctly aligned (strideM)");
+    XFORMERS_CHECK(
+        p.k_strideM % kMinimumAlignment == 0,
+        "key is not correctly aligned (strideM)");
+    XFORMERS_CHECK(
+        p.v_strideM % kMinimumAlignment == 0,
+        "value is not correctly aligned (strideM)");
+    if (p.bias_ptr) { // bias strides only matter when a bias is supplied
+      XFORMERS_CHECK(
+          p.num_batches <= 1 || p.bias_strideB % kMinimumAlignment == 0,
+          "attn_bias is not correctly aligned (strideB)");
+      XFORMERS_CHECK(
+          p.num_heads <= 1 || p.bias_strideH % kMinimumAlignment == 0,
+          "attn_bias is not correctly aligned (strideH)");
+      XFORMERS_CHECK(
+          p.bias_strideM % kMinimumAlignment == 0,
+          "attn_bias is not correctly aligned (strideM)");
+    }
+    if (p.grad_bias_ptr) { // likewise for the bias gradient
+      XFORMERS_CHECK(
+          p.num_batches <= 1 || p.gB_strideB % kMinimumAlignment == 0,
+          "attn_bias.grad is not correctly aligned (strideB)");
+      XFORMERS_CHECK(
+          p.num_heads <= 1 || p.gB_strideH % kMinimumAlignment == 0,
+          "attn_bias.grad is not correctly aligned (strideH)");
+      XFORMERS_CHECK(
+          p.gB_strideM % kMinimumAlignment == 0,
+          "attn_bias.grad is not correctly aligned (strideM)");
+    }
+    XFORMERS_CHECK(
+        !(p.cu_seqlens_q_ptr && p.bias_ptr),
+        "CuSeqlen + bias not implemented yet");
+    XFORMERS_CHECK(
+        p.custom_mask_type < NumCustomMaskTypes,
+        "Invalid value for `custom_mask_type`");
+    XFORMERS_CHECK(
+        p.dropout_prob <= 1.0f && p.dropout_prob >= 0.0f,
+        "Invalid value for `dropout_prob`");
+    XFORMERS_CHECK(
+        kApplyDropout || p.dropout_prob == 0.0f,
+        "Set `kApplyDropout`=True to support `dropout_prob > 0`");
+    XFORMERS_CHECK(p.head_dim > 0, "Invalid value for `head_dim`");
+    XFORMERS_CHECK(p.head_dim_value > 0, "Invalid value for `head_dim_value`");
+    XFORMERS_CHECK(p.num_queries > 0, "Invalid value for `num_queries`");
+    XFORMERS_CHECK(p.num_keys > 0, "Invalid value for `num_keys`");
+    XFORMERS_CHECK(p.num_heads > 0, "Invalid value for `num_heads`");
+    XFORMERS_CHECK(p.num_batches > 0, "Invalid value for `num_batches`");
+    XFORMERS_CHECK(p.head_dim <= kMaxK, "kMaxK: Expected `head_dim <= kMaxK`");
+    XFORMERS_CHECK(
+        p.head_dim_value <= kMaxK, "kMaxK: Expected `head_dim_value <= kMaxK`");
+    if (kKeysQueriesAlignedToBlockSize) {
+      XFORMERS_CHECK(
+          p.cu_seqlens_k_ptr == nullptr,
+          "This kernel does not support cu_seqlen");
+      XFORMERS_CHECK(
+          p.cu_seqlens_q_ptr == nullptr,
+          "This kernel does not support cu_seqlen");
+      XFORMERS_CHECK(
+          p.num_queries % kBlockSizeI == 0,
+          "kKeysQueriesAlignedToBlockSize condition not respected");
+      XFORMERS_CHECK(
+          p.num_keys % kBlockSizeJ == 0,
+          "kKeysQueriesAlignedToBlockSize condition not respected");
+    }
+    XFORMERS_CHECK(
+        kEnableSplitKeys || p.num_splits_key == 1, "SplitKeys is disabled");
+    XFORMERS_CHECK(
+        p.num_splits_key > 0, "Invalid `num_splits_key` (expected >0)");
+    XFORMERS_CHECK(
+        p.num_splits_key <= cutlass::ceil_div(p.num_keys, kBlockSizeJ),
+        "Invalid `num_splits_key` (too large)");
+    return true; // reached only if none of the check macros bailed out
+  }
+
+  static CUTLASS_DEVICE void attention_kernel(Params p) {
+    extern __shared__ char smem_buffer[];
+    SharedStorage& shared_storage = *((SharedStorage*)smem_buffer);
+    // Kernel body: outer loop over key blocks, inner loop over query blocks.
+    uint16_t thread_id = threadIdx.x;
+    uint8_t warp_id = warp_uniform(thread_id / 32);
+    uint8_t lane_id = thread_id % 32;
+    // Initial key offset from split_key_device(); return if >= num_keys.
+    int32_t key_start = p.split_key_device() * kBlockSizeJ;
+    if (key_start >= p.num_keys) {
+      return;
+    }
+    if (kPrologueQK) {
+      int32_t query_start = getQueryStart(p, key_start);
+      prologueQkNextIteration<true>(
+          shared_storage, p, query_start, key_start, warp_id, lane_id);
+    }
+    // Computes (dO*out).sum(-1) and writes it to `p.delta_ptr`, templated on
+    // 128 bits' worth of elements if `head_dim_value` divides evenly, else 1.
+    if (kKernelComputesDelta) {
+      constexpr int kOptimalElements =
+          128 / cutlass::sizeof_bits<scalar_t>::value;
+      if (p.head_dim_value % kOptimalElements == 0) {
+        for (int query_start = 0; query_start < p.num_queries;
+             query_start += kBlockSizeI) {
+          computeDelta<kOptimalElements>(p, query_start, warp_id, lane_id);
+        }
+      } else {
+        for (int query_start = 0; query_start < p.num_queries;
+             query_start += kBlockSizeI) {
+          computeDelta<1>(p, query_start, warp_id, lane_id);
+        }
+      }
+      __syncthreads();
+    }
+    // Accumulators handed to processBlockIJ; cleared once per key block.
+    OutputFragments output_frags;
+    // Dropout RNG state; only initialized if HAS_PYTORCH and kApplyDropout.
+    curandStatePhilox4_32_10_t rng_state_init;
+#ifdef HAS_PYTORCH
+    if (kApplyDropout) {
+      auto seeds = at::cuda::philox::unpack(p.rng_engine_inputs);
+      // Each element of the attention matrix P with shape
+      // (batch_sz, n_heads, n_queries, n_keys) is associated with a single
+      // offset in the RNG sequence. We initialize the RNG state with the
+      // offset that starts at the beginning of the (n_queries, n_keys) matrix
+      // for this block's batch_id and head_id.
+      // Initializing the RNG state is very expensive, so we do it once per
+      // kernel rather than once per iteration. Each iteration takes a copy of
+      // the initialized RNG state and offsets it as needed.
+      curand_init(
+          std::get<0>(seeds),
+          0,
+          std::get<1>(seeds) + p.dropout_batch_head_rng_offset,
+          &rng_state_init);
+    }
+#endif
+    CUTLASS_PRAGMA_UNROLL
+    for (; key_start < p.num_keys;
+         key_start += p.num_splits_key_device() * kBlockSizeJ) {
+      output_frags.clear();
+      // Per key block: query loop, then writeFragsToGmem or zfillGradKV below.
+      CUTLASS_PRAGMA_UNROLL
+      for (int32_t query_start_shifted = getQueryStart(p, key_start);
+           query_start_shifted < getQueryStartShift(p) + getQueryEnd(p);
+           query_start_shifted += kBlockSizeI) {
+        // This line here
+        // vvvvvvvvvvvvvv
+        warp_id = warp_uniform(warp_id);
+        // ^^^^^^^^^^^^^^
+        // ... makes everything use less RF (register file) and be 10% faster.
+        // Why? I don't know. My theory is that it forces `nvcc` to
+        // re-compute indices, offsets etc... and not keep them
+        // from the previous iteration, which prevents MASSIVE
+        // register spilling.
+        // Shifted starts at or past `num_queries` wrap modulo getQueryEnd(p).
+        int32_t query_start = query_start_shifted;
+        if (query_start >= p.num_queries) {
+          query_start = query_start % getQueryEnd(p);
+        }
+        // Handle the (query_start, key_start) tile.
+        processBlockIJ<kKeysQueriesAlignedToBlockSize>(
+            shared_storage,
+            output_frags,
+            p,
+            query_start,
+            key_start,
+            rng_state_init,
+            warp_id,
+            lane_id);
+      }
+      if (kOutputInRF) {
+        writeFragsToGmem<kKeysQueriesAlignedToBlockSize>(
+            shared_storage, output_frags, p, key_start, warp_id, lane_id);
+      } else if (getQueryStart(p, key_start) >= p.num_queries) {
+        zfillGradKV<kKeysQueriesAlignedToBlockSize>(
+            p, key_start, warp_id, lane_id);
+      }
+      __syncthreads();
+    }
+  }
+
+  template <bool skipBoundsChecks>
+  static CUTLASS_DEVICE void zfillGradKV(
+      Params const& p,
+      int32_t key_start,
+      uint8_t warp_id,
+      uint8_t lane_id) {
+    // Zero-fills the grad_key / grad_value rows of the key block starting at
+    // `key_start`. Unoptimized on purpose: it is only used for keys that
+    // attend to no query because of causal masking, which should be rare.
+    constexpr int kThreadsPerKey = 8;
+    constexpr int kParallelKeys = kNumThreads / kThreadsPerKey;
+    static_assert(kBlockSizeJ % kParallelKeys == 0, "");
+
+    // Groups of `kThreadsPerKey` consecutive threads share one key row.
+    const int flat_tid = 32 * warp_id + lane_id;
+    const int row_in_pass = flat_tid / kThreadsPerKey;
+    const int first_col = lane_id % kThreadsPerKey;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < kBlockSizeJ; j += kParallelKeys) {
+      const int key = key_start + j + row_in_pass;
+      if (skipBoundsChecks || key < p.num_keys) {
+        auto grad_v_row = p.grad_value_ptr + key * p.gV_strideM();
+        auto grad_k_row = p.grad_key_ptr + key * p.gK_strideM();
+        for (int c = first_col; c < p.head_dim_value; c += kThreadsPerKey) {
+          grad_v_row[c] = scalar_t(0);
+        }
+        for (int c = first_col; c < p.head_dim; c += kThreadsPerKey) {
+          grad_k_row[c] = scalar_t(0);
+        }
+      }
+    }
+  }
+
+  template <bool skipBoundsChecks>
+  static CUTLASS_DEVICE void processBlockIJ(
+      SharedStorage& shared_storage,
+      OutputFragments& output_frags,
+      Params& p,
+      int32_t query_start,
+      int32_t key_start,
+      const curandStatePhilox4_32_10_t& curand_state_init,
+      uint8_t warp_id,
+      uint8_t lane_id) {
+    cutlass::Array<cutlass::uint1b_t, MatmulDOIVJ::Mma::FragmentC::kElements>
+        dropout_keep_mask_doivj;
+    dropout_keep_mask_doivj.fill(cutlass::uint1b_t{1});
+    const float dropout_scale =
+        kApplyDropout ? 1.0 / (1.0 - p.dropout_prob) : 1.0f;
+
+    cutlass::MatrixCoord no_offset{0, 0};
+    accum_t scale = p.scale;
+    int16_t thread_id = 32 * warp_id + lane_id;
+
+    auto rematerializeThreadIds = [&]() {
+      // Prevents `nvcc` from keeping values deduced from
+      // `thread_id`, `warp_id`, ... in RF - to reduce register pressure
+      warp_id = warp_uniform(thread_id / 32);
+      lane_id = thread_id % 32;
+      thread_id = 32 * warp_id + lane_id;
+    };
+
+    bool isFirstQuery = (query_start == getQueryStart(p, key_start));
+    int32_t next_query, next_key;
+    incrIteration(p, query_start, key_start, next_query, next_key);
+    bool isLastQuery = next_key != key_start;
+
+    accum_t di_rf = accum_t(0);
+    if (thread_id < kBlockSizeI) {
+      if (query_start + thread_id < p.num_queries) {
+        di_rf = p.delta_ptr[query_start + thread_id];
+      }
+      shared_storage.di()[thread_id] = di_rf;
+    }
+
+    int32_t num_queries_in_block = skipBoundsChecks
+        ? MatmulQK::Mma::Shape::kN
+        : warp_uniform(cutlass::fast_min(
+              (int32_t)MatmulQK::Mma::Shape::kN, p.num_queries - query_start));
+    int32_t num_keys_in_block = skipBoundsChecks
+        ? MatmulQK::Mma::Shape::kM
+        : warp_uniform(cutlass::fast_min(
+              (int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start));
+
+    auto prologueGradV = [&](int col) {
+      typename MatmulGradV::Mma::IteratorB iterator_dO(
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM + col,
+          {num_queries_in_block, p.head_dim_value - col},
+          thread_id,
+          no_offset);
+      MatmulGradV::Mma::prologue(
+          shared_storage.mm_gradV(),
+          iterator_dO,
+          thread_id,
+          num_queries_in_block);
+    };
+    auto prologueGradQ = [&](int col) {
+      typename MatmulGradQ::Mma::IteratorB iterator_K(
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM + col,
+          {num_keys_in_block, p.head_dim - col},
+          thread_id,
+          no_offset);
+      MatmulGradQ::Mma::prologue(
+          shared_storage.mm_gradQ(), iterator_K, thread_id, num_keys_in_block);
+    };
+    auto prologueGradK = [&](int col) {
+      typename MatmulGradK::Mma::IteratorB iterator_Q(
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM + col,
+          {num_queries_in_block, p.head_dim - col},
+          thread_id,
+          no_offset);
+      MatmulGradK::Mma::prologue(
+          shared_storage.mm_gradK(),
+          iterator_Q,
+          thread_id,
+          num_queries_in_block);
+    };
+    auto prologueDOV = [&]() {
+      typename MatmulDOIVJ::Mma::IteratorA iterator_A(
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM,
+          {num_queries_in_block, p.head_dim_value},
+          thread_id,
+          no_offset);
+      typename MatmulDOIVJ::Mma::IteratorB iterator_B(
+          {int32_t(p.v_strideM)},
+          p.value_ptr + key_start * p.v_strideM,
+          {p.head_dim_value, num_keys_in_block},
+          thread_id,
+          no_offset);
+      MatmulDOIVJ::Mma::prologue(
+          shared_storage.mm_doivj(),
+          iterator_A,
+          iterator_B,
+          thread_id,
+          p.head_dim_value);
+    };
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    // MatmulQK
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    {
+      using Mma = typename MatmulQK::Mma;
+
+      cutlass::gemm::GemmCoord problem_size(
+          num_keys_in_block,
+          num_queries_in_block,
+          p.head_dim // k
+      );
+
+      // k_j
+      typename Mma::IteratorA iterator_A(
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM,
+          {problem_size.m(), problem_size.k()},
+          thread_id,
+          no_offset);
+
+      // q_i.transpose(-2, -1)
+      typename Mma::IteratorB iterator_B(
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM,
+          {problem_size.k(), problem_size.n()},
+          thread_id,
+          no_offset);
+
+      Mma mma(
+          shared_storage.mm_qk_k(),
+          shared_storage.mm_qk_q(),
+          thread_id,
+          warp_id,
+          lane_id);
+
+      typename Mma::FragmentC accum;
+
+      accum.clear();
+
+      auto gemm_k_iterations =
+          (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma.set_prologue_done(kPrologueQK);
+      mma.set_zero_outside_bounds(!skipBoundsChecks);
+      mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+      accum = cutlass::multiplies<typename Mma::FragmentC>()(scale, accum);
+
+      // Epilogue: add LSE + exp and store that to our shared memory buffer
+      // shmem <- (matmul_result -
+      // logsumexp[i_start:i_end].unsqueeze(1)).exp()
+      int warp_idx_mn_0 =
+          warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN);
+      auto output_tile_coords = cutlass::MatrixCoord{
+          warp_idx_mn_0 % Mma::Base::WarpCount::kM,
+          warp_idx_mn_0 / Mma::Base::WarpCount::kM};
+
+      // apply bias if applicable
+      if (p.bias_ptr != nullptr) {
+        // load bias tile Bij into shared memory
+        typename MatmulQK::BiasLoader::GmemTileIterator bias_iter(
+            {cutlass::layout::RowMajor(p.bias_strideM)},
+            p.bias_ptr + query_start * p.bias_strideM + key_start,
+            {num_queries_in_block, num_keys_in_block},
+            thread_id);
+        cutlass::TensorRef<scalar_t, cutlass::layout::RowMajor> bias_tensor_ref(
+            shared_storage.bias().data(),
+            cutlass::layout::RowMajor(MatmulQK::ThreadblockShape::kM));
+        typename MatmulQK::BiasLoader::SmemTileIterator smem_tile_iter(
+            bias_tensor_ref, thread_id);
+        MatmulQK::BiasLoader::load(bias_iter, smem_tile_iter);
+
+        // Pij += Bij, where Pij is in register fragment and Bij is in shmem
+        auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset(
+            lane_id, warp_id, output_tile_coords);
+        MatmulQK::AccumLambdaIterator::iterateRows(
+            lane_offset,
+            [&](int accum_n) {},
+            [&](int accum_m, int accum_n, int idx) {
+              // remember we are transposed
+              accum[idx] += bias_tensor_ref.at({accum_n, accum_m});
+            },
+            [&](int accum_n) {});
+      }
+
+      // Apply mask
+      if (p.custom_mask_type == CausalFromTopLeft ||
+          p.custom_mask_type == CausalFromBottomRight) {
+        auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset(
+            lane_id, warp_id, output_tile_coords);
+        int shift = query_start - key_start;
+        if (p.custom_mask_type == CausalFromBottomRight) {
+          shift += p.num_keys - p.num_queries;
+        }
+        // current_key = key_start + accum_m
+        // current_query = query_start + accum_n
+        // mask if: `current_key > current_query`
+        MatmulQK::AccumLambdaIterator::iterateRows(
+            lane_offset,
+            [&](int accum_m) {},
+            [&](int accum_m, int accum_n, int idx) {
+              if (accum_m > accum_n + shift) {
+                accum[idx] =
+                    -cutlass::platform::numeric_limits<accum_t>::infinity();
+              }
+            },
+            [&](int accum_m) {});
+      }
+
+      __syncthreads();
+      if (kPrologueGV) {
+        prologueGradV(0);
+      }
+      if (kPrologueDOV) {
+        prologueDOV();
+      }
+
+      MatmulQK::B2bGemm::accumApplyLSEToSmem(
+          shared_storage.attn_shared_storage(),
+          accum,
+          p.logsumexp_ptr + query_start,
+          problem_size.n(),
+          thread_id,
+          warp_id,
+          lane_id,
+          output_tile_coords);
+#if 0
+      auto accum_ref_attnT = shared_storage.attn_shared_storage().accum_ref();
+      PRINT_TENSOR4x4_T0_L0("attn_T", accum_ref_attnT);
+#endif
+
+      // if we are using dropout, compute Zij, writing it to shared memory.
+      // each element of Zij is:
+      // - 0 with probability dropout_p
+      // - 1 / (1 - dropout_p) with probability 1 - dropout_p
+      if (kApplyDropout) {
+        auto zij = shared_storage.zij().accum_ref();
+        // each thread generates a contiguous sequence of elements in Zij, all
+        // in the same row. the reason they have to come from the same row is
+        // that sampling random numbers from a contiguous random number sequence
+        // is much more efficient than jumping around, and the linear offset of
+        // each element of Z (the global matrix) maps to an offset in a random
+        // number sequence. for Z, the end of a row and the beginning of the
+        // next have adjacent offsets, but for Zij (tile of global matrix), this
+        // is not necessarily the case.
+        // We must fill the entire `zij` shmem with values (even out of bounds
+        // on the K-dimension) otherwise we can get NaNs during the GEMM
+        const int kQueriesPerBlock = kBlockSizeI;
+        const int threads_per_row = cutlass::fast_min(
+            int32_t(kNumThreads / kQueriesPerBlock), num_keys_in_block);
+        const int elts_per_thread = cutlass::round_nearest(
+            cutlass::ceil_div(num_keys_in_block, threads_per_row), 4);
+
+        const int thread_i = thread_id / threads_per_row;
+        const int thread_start_j =
+            (thread_id % threads_per_row) * elts_per_thread;
+
+        if (thread_i < kQueriesPerBlock && thread_start_j < num_keys_in_block) {
+          curandStatePhilox4_32_10_t curand_state = curand_state_init;
+          skipahead(
+              (query_start + thread_i) * p.num_keys +
+                  (key_start + thread_start_j),
+              &curand_state);
+
+          // generate elements of Zij, 4 elements at a time
+          for (int zij_start_col_idx = thread_start_j; zij_start_col_idx <
+               cutlass::fast_min<int32_t>(thread_start_j + elts_per_thread,
+                                          num_keys_in_block);
+               zij_start_col_idx += 4) {
+            const float4 rand_uniform_quad = curand_uniform4(&curand_state);
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int quad_idx = 0; quad_idx < 4; ++quad_idx) {
+              // we'll write Zij transposed since attention is also transposed
+              // during the matmul to compute dV.
+              zij.at({zij_start_col_idx + quad_idx /*k*/, thread_i /*q*/}) =
+                  (&rand_uniform_quad.x)[quad_idx] > p.dropout_prob
+                  ? scalar_t(dropout_scale)
+                  : scalar_t(0);
+            }
+          }
+        }
+        __syncthreads();
+#if 0
+        PRINT_TENSOR4x4_T0_L0("zij", zij);
+        PRINT_TENSOR4x4_T0_L0_START("zij", zij, kBlockSizeJ - 4, kBlockSizeI - 4);
+#endif
+
+        // Save mask for later DOIVJ matmul
+
+        int warp_idx_mn_0 = warp_id %
+            (MatmulDOIVJ::Mma::Base::WarpCount::kM *
+             MatmulDOIVJ::Mma::Base::WarpCount::kN);
+        auto output_tile_coords_doivj = cutlass::MatrixCoord{
+            warp_idx_mn_0 % MatmulDOIVJ::Mma::Base::WarpCount::kM,
+            warp_idx_mn_0 / MatmulDOIVJ::Mma::Base::WarpCount::kM};
+        auto lane_offset = MatmulDOIVJ::AccumLambdaIterator::get_lane_offset(
+            lane_id, warp_id, output_tile_coords_doivj);
+        MatmulDOIVJ::AccumLambdaIterator::iterateRows(
+            lane_offset,
+            [&](int accum_m) {},
+            [&](int accum_m /*q*/, int accum_n /*k*/, int idx) {
+              if (zij.at({accum_n, accum_m}) == scalar_t(0)) {
+                dropout_keep_mask_doivj[idx] = cutlass::uint1b_t{0};
+              }
+            },
+            [&](int accum_m) {});
+      }
+      __syncthreads();
+    }
+    rematerializeThreadIds();
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    // GradV matmul
+    //
+    // grad_v[j_start:j_end] += attn_T @ do_i
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    constexpr bool kSingleIterationGradV =
+        kMaxK <= MatmulGradV::ThreadblockShape::kN;
+    for (int col = 0; col < (kSingleIterationGradV ? 1 : p.head_dim_value);
+         col += MatmulGradV::ThreadblockShape::kN) {
+      using Mma = typename MatmulGradV::Mma;
+      using AccumTileGmem = typename MatmulGradQ::AccumTileGmem;
+
+      cutlass::gemm::GemmCoord problem_size(
+          num_keys_in_block, p.head_dim_value - col, num_queries_in_block);
+      auto createEpilogueIter = [&]() {
+        return typename MatmulGradV::OutputTileIterator(
+            typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
+            p.grad_value_ptr + key_start * p.gV_strideM() + col,
+            {num_keys_in_block, p.head_dim_value - col},
+            thread_id);
+      };
+      typename Mma::IteratorB iterator_B(
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM + col,
+          {num_queries_in_block, p.head_dim_value - col},
+          thread_id,
+          no_offset);
+
+      // if dropout: dVj += (Pij.T * Zij) @ dOi
+      // otherwise:  dVj += Pij.T @ dOi
+      Mma mma(
+          // operand A: Pij.T
+          shared_storage.attn_shared_storage().accum_ref(),
+          // operand A_scale Zij.T:
+          // if we're using dropout, operand A is Pij_dropped.T = Pij.T * Zij.T
+          // which is computed on the fly as fragments of Pij.T are loaded in
+          shared_storage.zij().accum_ref(),
+          // operand B: dOi - which was loaded into shared memory previously
+          // when we computed dVj
+          shared_storage.mm_gradV().operand_B_ref(),
+          thread_id,
+          warp_id,
+          lane_id);
+
+      int storage_id = col / MatmulGradV::ThreadblockShape::kN;
+      AccumTileGmem gmem_tile{
+          p.workspace_gv + storage_id * AccumTileGmem::kElementsStored};
+      if (!kOutputInRF) {
+        if (isFirstQuery || !kNeedsAccumGradV) {
+          output_frags.gradV.clear();
+        } else {
+          gmem_tile.load(output_frags.gradV, thread_id);
+        }
+      }
+      mma.set_prologue_done(kPrologueGV);
+
+      auto gemm_k_iterations =
+          (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      __syncthreads();
+
+      mma(gemm_k_iterations,
+          output_frags.gradV,
+          iterator_B,
+          output_frags.gradV);
+      __syncthreads();
+      if (kPrologueGV && !kSingleIterationGradV &&
+          col + MatmulGradV::ThreadblockShape::kN < p.head_dim_value) {
+        prologueGradV(col + MatmulGradV::ThreadblockShape::kN);
+      }
+
+      if (!kOutputInRF) {
+        if (kNeedsAccumGradV && !isLastQuery) {
+          gmem_tile.store(output_frags.gradV, thread_id);
+        } else {
+          accumulateInGmem<MatmulGradV>(
+              shared_storage.gradV_epilogue(),
+              output_frags.gradV,
+              createEpilogueIter(),
+              isFirstQuery || kNeedsAccumGradV,
+              warp_id,
+              lane_id);
+        }
+      }
+    }
+    __syncthreads();
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    // MatmulDOIVJ
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    {
+      using Mma = typename MatmulDOIVJ::Mma;
+      // do_i
+      typename Mma::IteratorA iterator_A(
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM,
+          {num_queries_in_block, p.head_dim_value},
+          thread_id,
+          no_offset);
+
+      // v_j.transpose(-2, -1)
+      typename Mma::IteratorB iterator_B(
+          {int32_t(p.v_strideM)},
+          p.value_ptr + key_start * p.v_strideM,
+          {p.head_dim_value, num_keys_in_block},
+          thread_id,
+          no_offset);
+
+      Mma mma(shared_storage.mm_doivj(), thread_id, warp_id, lane_id);
+      mma.set_prologue_done(kPrologueDOV);
+      mma.set_zero_outside_bounds(!skipBoundsChecks);
+
+      typename Mma::FragmentC accum;
+
+      accum.clear();
+
+      auto gemm_k_iterations =
+          (p.head_dim_value + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+      __syncthreads();
+      if (kPrologueGQ) {
+        prologueGradQ(0);
+      }
+      if (kPrologueGK) {
+        prologueGradK(0);
+      }
+
+      int warp_idx_mn_0 =
+          warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN);
+      auto output_tile_coords = cutlass::MatrixCoord{
+          warp_idx_mn_0 % Mma::Base::WarpCount::kM,
+          warp_idx_mn_0 / Mma::Base::WarpCount::kM};
+      // TODO: This must be terribly inefficient. There must be a better way
+      // tmp [RF] <- (accum [RF] - Di [smem] ) * attn_T.T [smem]
+      // attn_shared_storage  [smem] <- tmp.T
+      // tmp_shared_storage [smem] <- tmp
+      {
+        using LambdaIterator = typename MatmulDOIVJ::AccumLambdaIterator;
+        auto lane_offset = LambdaIterator::get_lane_offset(
+            lane_id, warp_id, output_tile_coords);
+        // if dropout was used, compute dPij = dPij_dropped * Zij
+        if (kApplyDropout) {
+          LambdaIterator::iterateRows(
+              lane_offset,
+              [&](int accum_m) {},
+              [&](int accum_m, int accum_n, int idx) {
+                if (dropout_keep_mask_doivj[idx].get()) {
+                  accum[idx] *= dropout_scale;
+                } else {
+                  accum[idx] = 0;
+                }
+              },
+              [&](int accum_m) {});
+        }
+
+        auto attn_T = shared_storage.attn_shared_storage().accum_ref();
+#if 0
+        PRINT_B0_T0("doivj_dropped");
+        print_warp_accum<LambdaIterator>(accum, lane_offset, 4, 4);
+        PRINT_TENSOR4x4_T0_L0("attn_T", attn_T)
+#endif
+        accum_t current_di;
+        // dSij = (dPij - Di) * Pij
+        LambdaIterator::iterateRows(
+            lane_offset,
+            [&](int accum_m) { current_di = shared_storage.di()[accum_m]; },
+            [&](int accum_m, int accum_n, int idx) {
+              // TODO: Otherwise we can get nans as we
+              // might have infs here (only seen on f16 tho)
+              if (skipBoundsChecks ||
+                  (accum_m < num_queries_in_block &&
+                   accum_n < num_keys_in_block)) {
+                accum_t attn = attn_T.at({accum_n, accum_m});
+                accum[idx] = (accum[idx] - current_di) * attn;
+              } else {
+                accum[idx] = 0;
+              }
+            },
+            [&](int accum_m) {
+
+            });
+
+        // store bias gradient tile dBij to global memory,
+        // where dBij = dSij = Pij * (dPij - Di)
+        if (p.grad_bias_ptr != nullptr) {
+          typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator
+              output_iter(
+                  typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator::
+                      Params{p.gB_strideM},
+                  // grad_bias_ptr is offset to point at beginning of
+                  // matrix of shape (queries, keys) for a given
+                  // (batch_id, head_id) the pointer arithmetic here produces
+                  // a pointer to the start of the current tile within that
+                  // matrix
+                  p.grad_bias_ptr + query_start * p.gB_strideM + key_start,
+                  {num_queries_in_block, num_keys_in_block},
+                  thread_id);
+
+          // no-op epilogue operator - just casting and storing contents of
+          // accum to global memory
+          typename MatmulDOIVJ::BiasGradEpilogue::OutputOp output_op(
+              typename MatmulDOIVJ::BiasGradEpilogue::OutputOp::Params{1, 1});
+          typename MatmulDOIVJ::BiasGradEpilogue epilogue(
+              shared_storage.gradB_epilogue(), thread_id, warp_id, lane_id);
+          epilogue(output_op, output_iter, accum, output_iter);
+        }
+
+        accum = accum * scale;
+
+#if 0
+        PRINT_B0_T0("(doivj - di) * attn * scale");
+        print_warp_accum<LambdaIterator>(accum, lane_offset, 4, 4);
+#endif
+
+        __syncthreads();
+        if (!MatmulGradK::DefaultMmaFromSmem::kIsTransposedA) {
+          auto tmpT = shared_storage.tmpT_shared_storage().accum_ref();
+          // attn <- attn_T.T
+          LambdaIterator::iterateRows(
+              lane_offset,
+              [&](int accum_m) {},
+              [&](int accum_m, int accum_n, int idx) {
+                tmpT.at({accum_n, accum_m}) = scalar_t(accum[idx]);
+              },
+              [&](int accum_m) {});
+        }
+      }
+
+      MatmulDOIVJ::B2bGemm::accumToSmem(
+          shared_storage.tmp_shared_storage(),
+          accum,
+          lane_id,
+          output_tile_coords);
+      __syncthreads();
+    }
+    // Force `nvcc` to recompute values that depend on the variables just below
+    // to use less RF and prevent some spilling
+    p.head_dim = warp_uniform(p.head_dim);
+    p.k_strideM = warp_uniform(p.k_strideM);
+    rematerializeThreadIds();
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    // GradQ matmul
+    //
+    // grad_q[i_start:i_end] += tmp @ k_j
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    // Skip the loop & associated branches if we know at compile time the number
+    // of iterations
+    constexpr bool kSingleIterationGradQ =
+        kMaxK <= MatmulGradQ::ThreadblockShape::kN;
+    for (int col = 0; col < (kSingleIterationGradQ ? 1 : p.head_dim);
+         col += MatmulGradQ::ThreadblockShape::kN) {
+      using Mma = typename MatmulGradQ::Mma;
+      using AccumTileGmem = typename MatmulGradQ::AccumTileGmem;
+
+      cutlass::gemm::GemmCoord problem_size(
+          num_queries_in_block,
+          false ? MatmulGradQ::ThreadblockShape::kN : p.head_dim - col,
+          num_keys_in_block);
+
+      // k_j
+      typename Mma::IteratorB iterator_B(
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM + col,
+          {problem_size.k(), problem_size.n()},
+          thread_id,
+          no_offset);
+
+      auto a = shared_storage.tmp_shared_storage().accum_ref();
+      Mma mma(
+          // operand A: dSij
+          shared_storage.tmp_shared_storage().accum_ref(),
+          // operand B: Kj
+          shared_storage.mm_gradQ().operand_B_ref(),
+          thread_id,
+          warp_id,
+          lane_id);
+
+      typename Mma::FragmentC accum;
+
+      int col_id = col / MatmulGradQ::ThreadblockShape::kN;
+      int num_cols = kSingleIterationGradQ
+          ? 1
+          : ceil_div(p.head_dim, MatmulGradQ::ThreadblockShape::kN);
+      int storage_id = (col_id + query_start / kBlockSizeI * num_cols);
+
+      if (p.num_splits_key_device() > 1) {
+        AtomicLock::acquire(
+            &p.workspace_gq[storage_id].lock,
+            p.split_key_device() + 1,
+            thread_id);
+        // Make sure we can see other block's output
+        __threadfence();
+      }
+
+      AccumTileGmem gmem_tile{&p.workspace_gq[storage_id].buffer[0]};
+      if (!kNeedsAccumGradQ ||
+          (p.num_splits_key_device() == 1 && key_start == 0)) {
+        // if we know we are the first to access it, we know it's only zeros.
+        // Avoids a load from gmem (and gmem init as well)
+        accum.clear();
+      } else {
+        gmem_tile.load(accum, thread_id);
+      }
+
+      auto gemm_k_iterations =
+          (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      __syncthreads();
+      mma.set_prologue_done(kPrologueGQ);
+      mma(gemm_k_iterations, accum, iterator_B, accum);
+      __syncthreads();
+      bool isLastColumn = kSingleIterationGradQ ||
+          (col + MatmulGradQ::ThreadblockShape::kN >= p.head_dim);
+      if (kPrologueGQ && !isLastColumn) {
+        prologueGradQ(col + MatmulGradQ::ThreadblockShape::kN);
+      }
+
+      bool isLast = [&]() {
+        int32_t next_key = key_start + p.num_splits_key_device() * kBlockSizeJ;
+        if (p.num_keys <= next_key) {
+          return true;
+        }
+        if (query_start < getSmallestQueryForKey(p, next_key)) {
+          return true;
+        }
+        return false;
+      }();
+      // Output results
+      if (p.num_splits_key_device() > 1) {
+        int32_t numAddsSoFar = -1;
+        if (isLast && thread_id == 0) {
+          numAddsSoFar = atomicAdd(&p.workspace_gq[storage_id].counter, 1) +
+              1; // `atomicAdd` returns the old value
+        }
+        isLast = __syncthreads_or(
+            numAddsSoFar == getNumParallelBlocksForQuery(p, query_start));
+        assert(numAddsSoFar <= getNumParallelBlocksForQuery(p, query_start));
+      }
+      if (kNeedsAccumGradQ && !isLast) {
+        gmem_tile.store(accum, thread_id);
+        if (p.num_splits_key_device() > 1) {
+          // Make sure everyone wrote before we release the lock
+          __threadfence();
+          __syncthreads();
+          AtomicLock::release(&p.workspace_gq[storage_id].lock, thread_id);
+        }
+      } else {
+        // NOTE: We're not releasing the lock because no one is expected
+        // to come after us (we're the last one to write)
+        typename MatmulGradQ::OutputTileIterator output_it(
+            typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()},
+            p.grad_query_ptr + query_start * p.gQ_strideM() + col,
+            {problem_size.m(), problem_size.n()},
+            thread_id);
+        bool storage_contains_zeros = kNeedsAccumGradQ || key_start == 0 ||
+            (p.num_splits_key_device() > 1);
+        accumulateInGmem<MatmulGradQ>(
+            isLastColumn ? shared_storage.gradQ_epilogue_lastIter()
+                         : shared_storage.gradQ_epilogue(),
+            accum,
+            output_it,
+            storage_contains_zeros,
+            warp_id,
+            lane_id);
+      }
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    // GradK matmul
+    //
+    // grad_k[i_start:i_end] += tmp.transpose(-2, -1) @ q_i
+    /////////////////////////////////////////////////////////////////////////////////////////////////
+    rematerializeThreadIds();
+
+    constexpr bool kSingleIterationGradK =
+        kMaxK <= MatmulGradK::ThreadblockShape::kN;
+    for (int col = 0; col < (kSingleIterationGradK ? 1 : p.head_dim);
+         col += MatmulGradK::ThreadblockShape::kN) {
+      using Mma = typename MatmulGradK::Mma;
+      using AccumTileGmem = typename MatmulGradQ::AccumTileGmem;
+
+      cutlass::gemm::GemmCoord problem_size(
+          num_keys_in_block,
+          false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col,
+          num_queries_in_block);
+      auto createEpilogueIter = [&]() {
+        return typename MatmulGradK::OutputTileIterator(
+            typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
+            p.grad_key_ptr + key_start * p.gK_strideM() + col,
+            {num_keys_in_block,
+             false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col},
+            thread_id);
+      };
+
+      // q_i
+      typename Mma::IteratorB iterator_B(
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM + col,
+          {problem_size.k(), problem_size.n()},
+          thread_id,
+          no_offset);
+
+      auto getTmp = [&](int) { return &shared_storage.tmp_shared_storage(); };
+      auto getTmpT = [&](int) { return &shared_storage.tmpT_shared_storage(); };
+      // this is basically:
+      // opA = kIsTransposedA ? getTmp() : getTmpT();
+      bool constexpr kIsTransposedA =
+          MatmulGradK::DefaultMmaFromSmem::kIsTransposedA;
+      auto& opA = *call_conditional<
+          kIsTransposedA,
+          decltype(getTmp),
+          decltype(getTmpT)>::apply(getTmp, getTmpT, 0);
+      Mma mma(
+          // operand A: dSij.T
+          opA.accum_ref(),
+          // operand B: Qi
+          shared_storage.mm_gradK().operand_B_ref(),
+          thread_id,
+          warp_id,
+          lane_id);
+
+      int storage_id = col / MatmulGradK::ThreadblockShape::kN;
+      AccumTileGmem gmem_tile{
+          p.workspace + storage_id * AccumTileGmem::kElementsStored};
+      if (!kOutputInRF) {
+        if (isFirstQuery || !kNeedsAccumGradK) {
+          output_frags.gradK.clear();
+        } else {
+          gmem_tile.load(output_frags.gradK, thread_id);
+        }
+      }
+      mma.set_prologue_done(kPrologueGK);
+
+      auto gemm_k_iterations =
+          (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      __syncthreads();
+
+      mma(gemm_k_iterations,
+          output_frags.gradK,
+          iterator_B,
+          output_frags.gradK);
+      __syncthreads();
+      bool isLastColumn = kSingleIterationGradK ||
+          col + MatmulGradK::ThreadblockShape::kN >= p.head_dim;
+      if (kPrologueGK && !isLastColumn) {
+        prologueGradK(col + MatmulGradK::ThreadblockShape::kN);
+      }
+
+      if (kPrologueQK && isLastColumn) {
+        int32_t next_query, next_key;
+        incrIteration(p, query_start, key_start, next_query, next_key);
+        DISPATCH_BOOL(
+            next_key != key_start, kForceReloadK, ([&]() {
+              prologueQkNextIteration<kForceReloadK::value>(
+                  shared_storage, p, next_query, next_key, warp_id, lane_id);
+            }));
+      }
+
+      // Output results
+      if (!kOutputInRF) {
+        if (kNeedsAccumGradK && !isLastQuery) {
+          gmem_tile.store(output_frags.gradK, thread_id);
+        } else {
+          accumulateInGmem<MatmulGradK>(
+              isLastColumn ? shared_storage.gradK_epilogue_final()
+                           : shared_storage.gradK_epilogue(),
+              output_frags.gradK,
+              createEpilogueIter(),
+              isFirstQuery || kNeedsAccumGradK,
+              warp_id,
+              lane_id);
+          __syncthreads();
+        }
+      }
+    }
+  }
+  // Query offset applied per key split (0 when masked or when keys not split)
+  static CUTLASS_DEVICE int32_t getQueryStartShift(Params const& p) {
+    bool const isSplit =
+        p.custom_mask_type == NoCustomMask && p.num_splits_key_device() > 1;
+    // The modulo keeps the shifted start below getQueryEnd(p)
+    return isSplit ? (p.split_key_device() * kBlockSizeI) % getQueryEnd(p) : 0;
+  }
+
+  // Iteration order logic: query block where iteration starts for `key_start`
+  static CUTLASS_DEVICE int32_t
+  getQueryStart(Params const& p, int32_t key_start) {
+    return getQueryStartShift(p) + getSmallestQueryForKey(p, key_start);
+  }
+  static CUTLASS_DEVICE int32_t getQueryEnd(Params const& p) {
+    return align_up(p.num_queries, kBlockSizeI); // num_queries, block-aligned
+  };
+  // Smallest block-aligned query index used for the key block at `key_start`
+  static CUTLASS_DEVICE int32_t
+  getSmallestQueryForKey(Params const& p, int32_t key_start) {
+    int first_query = 0; // stays 0 when no causal mask is selected
+    if (p.custom_mask_type == CausalFromTopLeft) {
+      first_query = key_start;
+    } else if (p.custom_mask_type == CausalFromBottomRight) {
+      first_query =
+          cutlass::fast_max(0, key_start - p.num_keys + p.num_queries);
+    }
+    return (first_query / kBlockSizeI) * kBlockSizeI; // round down to a block
+  };
+
+  // Number of kernel blocks that write to the `grad_query` block starting at
+  // `query_start`. Usually the number of key splits, but capped by the number
+  // of key blocks left by a causal mask or a short key sequence
+  static CUTLASS_DEVICE int32_t
+  getNumParallelBlocksForQuery(Params const& p, int32_t query_start) {
+    // Upper bound of the key range used to count key blocks
+    int32_t key_limit = p.num_keys;
+    if (p.custom_mask_type == CausalFromTopLeft) {
+      int32_t last_key = query_start + kBlockSizeI - 1;
+      key_limit = cutlass::fast_min(last_key, p.num_keys);
+    } else if (p.custom_mask_type == CausalFromBottomRight) {
+      int32_t last_key =
+          query_start + (kBlockSizeI - 1) + (1 + p.num_keys - p.num_queries);
+      key_limit = cutlass::fast_min(last_key, p.num_keys);
+    }
+    int16_t num_key_blocks = ceil_div(key_limit, kBlockSizeJ);
+    return cutlass::fast_min(p.num_splits_key_device(), num_key_blocks);
+  };
+
+  // Computes the block to process after (query_start, key_start)
+  static CUTLASS_DEVICE void incrIteration(
+      Params const& p,
+      int32_t query_start,
+      int32_t key_start,
+      int32_t& next_query, // out: start of the next query block
+      int32_t& next_key) { // out: start of the next key block
+    next_query = query_start + kBlockSizeI;
+    next_key = key_start;
+    auto query_shift = getQueryStartShift(p);
+    // With a shifted start, queries wrap around instead of stopping at the end
+    if (query_shift) {
+      if (next_query >= p.num_queries) {
+        next_query = getSmallestQueryForKey(p, key_start);
+        return;
+      } else if (query_start < query_shift && query_shift <= next_query) {
+        // stepped over the shifted start: fall through and jump to next key
+      } else {
+        return;
+      }
+    } else {
+      if (next_query < p.num_queries) {
+        return;
+      }
+      // ran past the last query: fall through and jump to next key
+    }
+    // Next key: skip num_splits_key_device() key blocks, restart the queries
+    next_key = key_start + p.num_splits_key_device() * kBlockSizeJ;
+    next_query = getQueryStart(p, next_key);
+  }
+  // Issues the MatmulQK prologue for the block at (query_start, key_start)
+  template <bool kForceReloadK>
+  static CUTLASS_DEVICE void prologueQkNextIteration(
+      SharedStorage& shared_storage,
+      Params const& p,
+      int32_t query_start,
+      int32_t key_start,
+      uint8_t warp_id,
+      uint8_t lane_id) {
+    if (query_start >= p.num_queries || key_start >= p.num_keys) {
+      return; // out-of-range block: nothing to do
+    }
+    // K is reloaded when forced, or when smem does not hold the entire matrix
+    static constexpr bool kReloadK =
+        kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat;
+    int thread_id = 32 * warp_id + lane_id;
+    typename MatmulQK::Mma::IteratorA iterator_A(
+        {int32_t(p.k_strideM)},
+        p.key_ptr + key_start * p.k_strideM,
+        {p.num_keys - key_start, p.head_dim},
+        thread_id,
+        cutlass::MatrixCoord{0, 0});
+    // Operand B reads the query rows from `query_start` onwards
+    typename MatmulQK::Mma::IteratorB iterator_B(
+        {int32_t(p.q_strideM)},
+        p.query_ptr + query_start * p.q_strideM,
+        {p.head_dim, p.num_queries - query_start},
+        thread_id,
+        cutlass::MatrixCoord{0, 0});
+    // NOTE(review): the two flags presumably gate loading A and B - confirm
+    MatmulQK::Mma::template prologue<kReloadK, true>(
+        shared_storage.mm_qk_k(),
+        shared_storage.mm_qk_q(),
+        iterator_A,
+        iterator_B,
+        thread_id,
+        p.head_dim);
+  }
+  // Writes the gradV and gradK fragments out through grad_value / grad_key
+  template <bool skipBoundsChecks>
+  static CUTLASS_DEVICE void writeFragsToGmem(
+      SharedStorage& shared_storage,
+      OutputFragments& output_frags,
+      Params const& p,
+      int32_t key_start,
+      uint8_t warp_id,
+      uint8_t lane_id) {
+    uint16_t thread_id = 32 * warp_id + lane_id;
+    int32_t num_keys_in_block = skipBoundsChecks
+        ? MatmulQK::Mma::Shape::kM
+        : cutlass::fast_min(
+              (int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start);
+    typename MatmulGradV::OutputTileIterator outputV_it(
+        typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
+        p.grad_value_ptr + key_start * p.gV_strideM(),
+        {num_keys_in_block, p.head_dim_value},
+        thread_id);
+    // `true` is accumulateInGmem's `first` flag (selects ScaleType::Nothing)
+    accumulateInGmem<MatmulGradV>(
+        shared_storage.gradV_epilogue_final(),
+        output_frags.gradV,
+        outputV_it,
+        true,
+        warp_id,
+        lane_id);
+    // Same for gradK; `false ? ... : p.head_dim` always selects p.head_dim
+    typename MatmulGradK::OutputTileIterator outputK_it(
+        typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
+        p.grad_key_ptr + key_start * p.gK_strideM(),
+        {num_keys_in_block,
+         false ? MatmulGradK::ThreadblockShape::kN : p.head_dim},
+        thread_id);
+    accumulateInGmem<MatmulGradK>(
+        shared_storage.gradK_epilogue_final(),
+        output_frags.gradK,
+        outputK_it,
+        true,
+        warp_id,
+        lane_id);
+  }
+  // Runs MatmulT's pipelined epilogue on `accum`, writing through `output_it`
+  template <typename MatmulT>
+  static CUTLASS_DEVICE void accumulateInGmem(
+      typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem,
+      typename MatmulT::Mma::FragmentC const& accum,
+      typename MatmulT::OutputTileIterator output_it,
+      bool first, // selects the epilogue ScaleType below
+      uint8_t warp_id,
+      uint8_t lane_id) {
+    using DefaultEpilogue = typename MatmulT::DefaultEpilogue;
+    using DefaultOutputOp = typename MatmulT::DefaultOutputOp;
+    using Mma = typename MatmulT::Mma;
+    int thread_id = 32 * warp_id + lane_id;
+    DISPATCH_BOOL(
+        first, kIsFirst, ([&]() {
+          static constexpr auto ScaleType = kIsFirst::value
+              ? cutlass::epilogue::thread::ScaleType::Nothing
+              : cutlass::epilogue::thread::ScaleType::NoBetaScaling;
+          using EpilogueOutputOp =
+              typename cutlass::epilogue::thread::LinearCombination<
+                  typename DefaultOutputOp::ElementOutput,
+                  DefaultOutputOp::kCount,
+                  typename DefaultOutputOp::ElementAccumulator,
+                  typename DefaultOutputOp::ElementCompute,
+                  ScaleType>;
+          using Epilogue =
+              typename cutlass::epilogue::threadblock::EpiloguePipelined<
+                  typename DefaultEpilogue::Shape,
+                  typename Mma::Operator,
+                  DefaultEpilogue::kPartitionsK,
+                  typename MatmulT::OutputTileIterator,
+                  typename DefaultEpilogue::AccumulatorFragmentIterator,
+                  typename DefaultEpilogue::WarpTileIterator,
+                  typename DefaultEpilogue::SharedLoadIterator,
+                  EpilogueOutputOp,
+                  typename DefaultEpilogue::Padding,
+                  DefaultEpilogue::kFragmentsPerIteration,
+                  true // IterationsUnroll
+                  >;
+          EpilogueOutputOp rescale({1, 1}); // presumably {alpha, beta} - confirm
+          Epilogue epilogue(epilogue_smem, thread_id, warp_id, lane_id);
+          epilogue(rescale, output_it, accum, output_it); // same iterator twice
+        }));
+  }
+  // Computes delta[row] = sum over columns of output * grad_output, per query
+  template <int kElementsPerAccess>
+  static CUTLASS_DEVICE void computeDelta(
+      Params const& p,
+      int32_t query_start,
+      uint8_t warp_id,
+      uint8_t lane_id) {
+    // Each thread computes one value for Delta
+    // Depending on warp configuration, we might have multiple
+    // threads of the same warp working on the same row
+    using AccessType = cutlass::Array<scalar_t, kElementsPerAccess>;
+    static_assert(kNumThreads >= kBlockSizeI, "");
+    static constexpr int kNumThreadsPerLine = kNumThreads / kBlockSizeI;
+    int16_t thread_id = 32 * warp_id + lane_id;
+    // First column this thread loads, and its row relative to query_start
+    int16_t laneFirstCol = kElementsPerAccess * (lane_id % kNumThreadsPerLine);
+    int16_t laneRow = thread_id / kNumThreadsPerLine;
+    bool rowPred = (query_start + laneRow) < p.num_queries;
+    bool pred = rowPred;
+
+    // on windows, previous syntax __restrict__ AccessType*
+    // resulted in error: "restrict" is not allowed
+    const AccessType* __restrict__ grad_output_ptr =
+        reinterpret_cast<const AccessType*>(
+            p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM +
+            laneFirstCol);
+    const AccessType* __restrict__ output_ptr =
+        reinterpret_cast<const AccessType*>(
+            p.output_ptr + (query_start + laneRow) * p.o_strideM() +
+            laneFirstCol);
+
+    static constexpr int64_t kMaxIters =
+        kMaxK / (kElementsPerAccess * kNumThreadsPerLine);
+    constexpr int kPipelineStages = 2;
+    accum_t delta_value = accum_t(0);
+    using GlobalLoad =
+        cutlass::arch::global_load<AccessType, sizeof(AccessType)>;
+    AccessType frag_grad_output[kPipelineStages];
+    AccessType frag_output[kPipelineStages];
+    // Loads one access per tensor into stage `ld_pos`, then advances both
+    auto loadAndIncrement = [&](int ld_pos, bool is_valid) {
+      frag_grad_output[ld_pos].clear();
+      frag_output[ld_pos].clear();
+      GlobalLoad(frag_grad_output[ld_pos], grad_output_ptr, is_valid);
+      GlobalLoad(frag_output[ld_pos], output_ptr, is_valid);
+      grad_output_ptr += kNumThreadsPerLine;
+      output_ptr += kNumThreadsPerLine;
+    };
+    // Fill the first kPipelineStages - 1 stages before the main loop
+    CUTLASS_PRAGMA_UNROLL
+    for (int iter = 0; iter < kPipelineStages - 1; ++iter) {
+      int ld_pos = iter % kPipelineStages;
+      pred = pred &&
+          (laneFirstCol + iter * kElementsPerAccess * kNumThreadsPerLine) <
+              p.head_dim_value;
+      loadAndIncrement(ld_pos, pred);
+    }
+    auto columnIteration = [&](int iter) {
+      // Load for next iter
+      int ld_pos = (iter + kPipelineStages - 1) % kPipelineStages;
+      pred = pred &&
+          (laneFirstCol +
+           (iter + kPipelineStages - 1) * kElementsPerAccess *
+               kNumThreadsPerLine) < p.head_dim_value;
+      loadAndIncrement(ld_pos, pred);
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < AccessType::kElements; ++i) {
+        delta_value += accum_t(frag_output[iter % kPipelineStages][i]) *
+            accum_t(frag_grad_output[iter % kPipelineStages][i]);
+      }
+    };
+
+    // If we have a small upper bound for K, we can unroll the loop
+    if (kMaxK <= 256) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int iter = 0; iter < kMaxIters; ++iter) {
+        columnIteration(iter);
+      }
+    } else {
+      // Each iteration consumes kElementsPerAccess * kNumThreadsPerLine columns
+      int num_iters =
+          ceil_div(p.head_dim_value, kElementsPerAccess * kNumThreadsPerLine);
+      for (int iter = 0; iter < num_iters; ++iter) {
+        columnIteration(iter);
+      }
+    }
+
+    // Reduce between workers
+    static_assert(
+        kNumThreadsPerLine == 1 || kNumThreadsPerLine == 2 ||
+            kNumThreadsPerLine == 4,
+        "");
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < kNumThreadsPerLine; i *= 2) {
+      delta_value = delta_value + __shfl_xor_sync(0xffffffff, delta_value, i);
+    }
+
+    // Store in gmem
+    if (rowPred) {
+      p.delta_ptr[query_start + laneRow] = delta_value;
+    }
+  }
+};
+// Kernel entry point: positions `p` on this block, then runs the AK kernel
+template <typename AK>
+__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
+    attention_kernel_backward_batched_impl(typename AK::Params p) {
+  // A false return skips the block (presumably it has no work to do)
+  if (p.advance_to_block()) {
+    AK::attention_kernel(p);
+  }
+}
+// Declaration only; presumably defined per instantiation elsewhere - confirm
+template <typename AK>
+__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
+    attention_kernel_backward_batched(typename AK::Params params);
diff --git a/third_party/fused_multi_head_attention/kernel_forward.h b/third_party/fused_multi_head_attention/kernel_forward.h
new file mode 100644
index 0000000000..ed4e167759
--- /dev/null
+++ b/third_party/fused_multi_head_attention/kernel_forward.h
@@ -0,0 +1,1322 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#ifdef HAS_PYTORCH
+#include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
+#endif
+
+#include <curand_kernel.h>
+#include <cmath>
+#include <cinttypes>
+#include <vector>
+
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/matrix.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "debug_utils.h"
+#include "epilogue/epilogue_pipelined.h"
+#include "epilogue/epilogue_rescale_output.h"
+#include "gemm/custom_mma.h"
+#include "gemm/find_default_mma.h"
+#include "gemm/mma_from_smem.h"
+#include "gemm_kernel_utils.h"
+#include "transform/tile_smem_loader.h"
+
+using namespace gemm_kernel_utils;
+// Helpers private to this header: warps-per-SM selector and a float atomic max
+namespace {
+template <typename scalar_t, typename Arch>
+constexpr int getWarpsPerSmFw() {
+  // 16 for non-float types when targeting compute capability >= 80, else 12
+  return (!cutlass::platform::is_same<scalar_t, float>::value &&
+          Arch::kMinComputeCapability >= 80)
+      ? 16 : 12;
+}
+static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) {
+  // source: https://stackoverflow.com/a/51549250
+  // Non-negative values use a signed-int max, negative ones an unsigned min
+  if (value >= 0)
+    return __int_as_float(atomicMax((int*)addr, __float_as_int(value)));
+  return __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+}
+} // namespace
+
+// Default batch hook: its `advance_to_batch` changes nothing and returns true.
+// Supplying another type as ToBatchHookType_ (which is never the case in the
+// xformers library) lets the user define the logic each block uses to find
+// its data, through an `advance_to_batch` function with the signature below.
+// It should return false if there is no work to do for this block.
+// In general this will not work with saving for backward due to fixed layout
+// for logsumexp and incompatible rngs for dropout, so is likely only useful for
+// custom inference.
+struct DefaultToBatchHook {
+  template <typename Params>
+  CUTLASS_DEVICE static bool advance_to_batch(
+      Params&,
+      int64_t& /* q_start */,
+      int64_t& /* k_start */) {
+    return true;
+  }
+};
+
+template <
+    // The datatype of Q/K/V
+    typename scalar_t_,
+    // Architecture we are targeting (eg `cutlass::arch::Sm80`)
+    typename ArchTag,
+    // If Q/K/V are correctly aligned in memory and we can run a fast kernel
+    bool isAligned_,
+    int kQueriesPerBlock_,
+    int kKeysPerBlock_,
+    // upperbound on `max(value.shape[-1], query.shape[-1])`
+    int kMaxK_ = (int)cutlass::platform::numeric_limits<uint32_t>::max(),
+    // Supporting dropout is noticeably slower on V100 for some reason
+    // Set to false if you know at compile time you will never need dropout
+    bool kSupportsDropout_ = true,
+    bool kSupportsBias_ = true,
+    typename ToBatchHookType_ = DefaultToBatchHook>
+struct AttentionKernel {
+  enum CustomMaskType {
+    NoCustomMask = 0,
+    CausalFromTopLeft = 1,
+    CausalFromBottomRight = 2,
+    NumCustomMaskTypes,
+  };
+
+  using scalar_t = scalar_t_;
+  using accum_t = float;
+  using lse_scalar_t = float;
+  using output_t = scalar_t;
+  // Accumulator between 2 iterations
+  // Using `accum_t` improves perf on f16 at the cost of
+  // numerical errors
+  using output_accum_t = accum_t;
+  static constexpr bool kSupportsDropout = kSupportsDropout_;
+  static constexpr bool kSupportsBias = kSupportsBias_;
+  static constexpr int kKeysPerBlock = kKeysPerBlock_;
+  static constexpr int kQueriesPerBlock = kQueriesPerBlock_;
+  static constexpr int kMaxK = kMaxK_;
+  static constexpr bool kIsAligned = isAligned_;
+  static constexpr bool kSingleValueIteration = kMaxK <= kKeysPerBlock;
+  static constexpr int32_t kAlignLSE = 32; // block size of backward
+  static constexpr bool kIsHalf = cutlass::sizeof_bits<scalar_t>::value == 16;
+  static constexpr bool kPreloadV =
+      ArchTag::kMinComputeCapability >= 80 && kIsHalf;
+  static constexpr bool kKeepOutputInRF = kSingleValueIteration;
+  static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF &&
+      !cutlass::platform::is_same<output_accum_t, output_t>::value;
+
+  static_assert(kQueriesPerBlock % 32 == 0, "");
+  static_assert(kKeysPerBlock % 32 == 0, "");
+  static constexpr int kNumWarpsPerBlock =
+      kQueriesPerBlock * kKeysPerBlock / (32 * 32);
+  static constexpr int kWarpSize = 32;
+
+  // Launch bounds
+  static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock;
+  static constexpr int kMinBlocksPerSm =
+      getWarpsPerSmFw<scalar_t, ArchTag>() / kNumWarpsPerBlock;
+
+  struct Params {
+    // Input tensors
+    scalar_t* query_ptr = nullptr; // [num_queries, num_heads, head_dim]
+    scalar_t* key_ptr = nullptr; // [num_keys, num_heads, head_dim]
+    scalar_t* value_ptr = nullptr; // [num_keys, num_heads, head_dim_value]
+    scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys]
+    int32_t* seqstart_q_ptr = nullptr;
+    int32_t* seqstart_k_ptr = nullptr;
+
+    int32_t* seqlen_k_ptr = nullptr;
+    uint32_t causal_diagonal_offset = 0;
+
+    // Output tensors
+    output_t* output_ptr = nullptr; // [num_queries, num_heads, head_dim_value]
+    // [num_queries, num_heads, head_dim_value]
+    output_accum_t* output_accum_ptr = nullptr;
+    // [num_heads, num_queries] - can be null
+    lse_scalar_t* logsumexp_ptr = nullptr;
+
+    // Scale
+    accum_t scale = 0.0;
+
+    // Dimensions/strides
+    int32_t head_dim = 0;
+    int32_t head_dim_value = 0;
+    int32_t num_queries = 0;
+    int32_t num_keys = 0;
+    int32_t num_keys_absolute = 0;
+
+    uint8_t custom_mask_type = NoCustomMask;
+
+    int32_t q_strideM = 0;
+    int32_t k_strideM = 0;
+    int32_t v_strideM = 0;
+    int32_t bias_strideM = 0;
+
+    int32_t o_strideM = 0;
+
+    // Everything below is only used in `advance_to_block`
+    // and shouldn't use registers
+    int32_t q_strideH = 0;
+    int32_t k_strideH = 0;
+    int32_t v_strideH = 0;
+    int64_t bias_strideH = 0;
+
+    int64_t q_strideB = 0;
+    int64_t k_strideB = 0;
+    int64_t v_strideB = 0;
+    int64_t bias_strideB = 0;
+
+    int32_t num_batches = 0;
+    int32_t num_heads = 0;
+
+    // dropout
+    bool use_dropout = false;
+    unsigned long long dropout_batch_head_rng_offset = 0;
+    float dropout_prob = 0.0f;
+#ifdef HAS_PYTORCH
+    at::PhiloxCudaState rng_engine_inputs = at::PhiloxCudaState(0, 0);
+#endif
+
+    // Moves the pointers and sizes to the data this block should process
+    // Returns "false" if there is no work to do
+    CUTLASS_DEVICE bool advance_to_block() {
+      auto batch_id = blockIdx.z;
+      auto head_id = blockIdx.y;
+      auto query_start = blockIdx.x * kQueriesPerBlock;
+      // Row length used to index logsumexp: num_queries padded to kAlignLSE
+      auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE;
+      // NOTE(review): the offset below is formed in 32-bit before widening
+      if (kSupportsDropout) {
+        dropout_batch_head_rng_offset =
+            batch_id * num_heads * num_queries * num_keys +
+            head_id * num_queries * num_keys;
+      }
+
+      int64_t q_start = 0, k_start = 0;
+      // Advance to current batch - in case of different sequence lengths
+      constexpr bool kToBatchHook =
+          !cutlass::platform::is_same<ToBatchHookType_, DefaultToBatchHook>::
+              value;
+      if (kToBatchHook) {
+        // Call out to a custom implementation.
+        if (!ToBatchHookType_::advance_to_batch(*this, q_start, k_start)) {
+          return false;
+        }
+      } else if (seqstart_q_ptr != nullptr) {
+        assert(seqstart_k_ptr != nullptr);
+        seqstart_q_ptr += batch_id;
+
+        q_start = seqstart_q_ptr[0];
+        int64_t q_next_start = seqstart_q_ptr[1];
+        int64_t k_end;
+        seqstart_k_ptr += batch_id;
+        // seqlen_k_ptr, when set, overrides the key length of this batch
+        if (seqlen_k_ptr) {
+          k_start = seqstart_k_ptr[0];
+          k_end = k_start + seqlen_k_ptr[batch_id];
+        } else {
+          k_start = seqstart_k_ptr[0];
+          k_end = seqstart_k_ptr[1];
+        }
+
+        num_queries = q_next_start - q_start;
+        num_keys = k_end - k_start;
+
+        if (query_start >= num_queries) {
+          return false;
+        }
+      } else {
+        query_ptr += batch_id * q_strideB;
+        key_ptr += batch_id * k_strideB;
+        value_ptr += batch_id * v_strideB;
+        output_ptr += int64_t(batch_id * num_queries) * o_strideM;
+        if (output_accum_ptr != nullptr) {
+          output_accum_ptr +=
+              int64_t(batch_id * num_queries) * (head_dim_value * num_heads);
+        }
+        q_start = 0;
+        k_start = 0;
+      }
+
+      // Advance to the current batch / head / query_start
+      query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH;
+      key_ptr += k_start * k_strideM + head_id * k_strideH;
+
+      value_ptr += k_start * v_strideM + head_id * v_strideH;
+      output_ptr +=
+          int64_t(q_start + query_start) * o_strideM + head_id * head_dim_value;
+      // The bias pointer is advanced by batch and head only
+      if (kSupportsBias && attn_bias_ptr != nullptr) {
+        attn_bias_ptr += (batch_id * bias_strideB) + (head_id * bias_strideH);
+      }
+      if (output_accum_ptr != nullptr) {
+        output_accum_ptr +=
+            int64_t(q_start + query_start) * (head_dim_value * num_heads) +
+            head_id * head_dim_value;
+      } else {
+        // Accumulate directly in the destination buffer (eg for f32)
+        output_accum_ptr = (accum_t*)output_ptr;
+      }
+
+      if (logsumexp_ptr != nullptr) {
+        // lse[batch_id, head_id, query_start]
+        logsumexp_ptr +=
+            batch_id * lse_dim * num_heads + head_id * lse_dim + query_start;
+      }
+
+      // Custom masking
+      if (custom_mask_type == CausalFromBottomRight) {
+        causal_diagonal_offset = num_keys - num_queries;
+      }
+      // We use num_keys_absolute to index into the rng_state
+      // We need this index to match between forward and backwards
+      num_keys_absolute = num_keys;
+      if (custom_mask_type == CausalFromTopLeft ||
+          custom_mask_type == CausalFromBottomRight) {
+        // the bottom row of the current block is query_start + kQueriesPerBlock
+        // the last active key is then query_start + causal_diagonal_offset +
+        // kQueriesPerBlock so num_keys is the min between actual num_keys and
+        // this to avoid extra computations
+        num_keys = cutlass::fast_min(
+            int32_t(query_start + causal_diagonal_offset + kQueriesPerBlock),
+            num_keys);
+      }
+      // From here on, num_queries counts queries from this block's start
+      num_queries -= query_start;
+      num_batches = 0; // no longer used after
+
+      // If num_queries == 1 and there is only one key head, we are wasting
+      // 15/16th of tensor core compute. In that case:
+      //  - we only launch kernels for head_id % kQueriesPerBlock == 0
+      //  - we iterate over heads instead of queries (strideM = strideH)
+      if (num_queries == 1 && k_strideH == 0 && v_strideH == 0) {
+        if (head_id % kQueriesPerBlock != 0)
+          return false;
+        q_strideM = q_strideH;
+        num_queries = num_heads;
+        num_heads = 1; // unused but here for intent
+        // remove causal since n_query = 1
+        // otherwise, offset would change with head !
+        custom_mask_type = NoCustomMask;
+        o_strideM = head_dim_value;
+      }
+
+      // Make sure the compiler knows these variables are the same on all
+      // the threads of the warp.
+      // Only worth doing if they could have been modified above.
+      query_ptr = warp_uniform(query_ptr);
+      key_ptr = warp_uniform(key_ptr);
+      value_ptr = warp_uniform(value_ptr);
+      if (kSupportsBias) {
+        attn_bias_ptr = warp_uniform(attn_bias_ptr);
+      }
+      output_ptr = warp_uniform(output_ptr);
+      output_accum_ptr = warp_uniform(output_accum_ptr);
+      logsumexp_ptr = warp_uniform(logsumexp_ptr);
+      num_queries = warp_uniform(num_queries);
+      num_keys = warp_uniform(num_keys);
+      num_heads = warp_uniform(num_heads);
+      o_strideM = warp_uniform(o_strideM);
+      custom_mask_type = warp_uniform(custom_mask_type);
+      return true;
+    }
+
+    // Launch grid: x counts query tiles of kQueriesPerBlock rows each,
+    // y counts heads, z counts batches.
+    __host__ dim3 getBlocksGrid() const {
+      const auto query_tiles = ceil_div(num_queries, (int32_t)kQueriesPerBlock);
+      return dim3(query_tiles, num_heads, num_batches);
+    }
+
+    // Thread-block shape: x spans kWarpSize lanes, y spans the block's warps.
+    __host__ dim3 getThreadsGrid() const {
+      const dim3 block_shape(kWarpSize, kNumWarpsPerBlock, 1);
+      return block_shape;
+    }
+  };
+
+  struct MM0 {
+    /*
+      Compile-time configuration of the first matmul, which computes a
+      (kQueriesPerBlock x kKeysPerBlock) block of `Q @ K.T`.
+      While the calculation result is still hot in registers, we update
+      `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value
+      into a shared-memory ("AccumulatorSharedStorage") that is used later as
+      operand A for the second matmul (see MM1).
+
+      Notes on the choices below:
+      - kAlignmentA / kAlignmentB fall back to GemmType::kMinimumAlignment
+        when kIsAligned is false.
+      - The stage count is forced to 4 when
+        ArchTag::kMinComputeCapability >= 80 and kIsHalf; otherwise
+        DefaultConfig::kStages is used.
+      - Mma is the MakeCustomMma<..., kMaxK> variant when
+        kSingleValueIteration is set, else the default threadblock Mma.
+    */
+    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
+
+    using OpClass = typename GemmType::OpClass;
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            scalar_t,
+            scalar_t,
+            scalar_t, // ElementC
+            accum_t // ElementAccumulator
+            >;
+    static constexpr int kAlignmentA =
+        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment;
+    static constexpr int kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+    using ThreadblockShape = cutlass::gemm::
+        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        kAlignmentA,
+        scalar_t, // ElementB,
+        cutlass::layout::ColumnMajor, // LayoutB,
+        kAlignmentB,
+        accum_t,
+        cutlass::layout::RowMajor, // LayoutC,
+        OpClass,
+        ArchTag, // ArchTag
+        ThreadblockShape, // ThreadblockShape
+        WarpShape, // WarpShape
+        typename GemmType::InstructionShape, // InstructionShape
+        ArchTag::kMinComputeCapability >= 80 && kIsHalf
+            ? 4
+            : DefaultConfig::kStages,
+        typename GemmType::Operator // Operator
+        >::DefaultMma;
+    using MmaCore = typename DefaultMma::MmaCore;
+    using IteratorA = typename DefaultMma::IteratorA;
+    using IteratorB = typename DefaultMma::IteratorB;
+    using DefaultThreadblockMma = typename DefaultMma::ThreadblockMma;
+    using Mma = typename cutlass::platform::conditional<
+        kSingleValueIteration,
+        typename MakeCustomMma<DefaultThreadblockMma, kMaxK>::Mma,
+        DefaultThreadblockMma>::type;
+    using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator<
+        typename Mma::Operator::IteratorC,
+        accum_t,
+        kWarpSize>::Iterator;
+    static_assert(
+        MmaCore::WarpCount::kM * MmaCore::WarpCount::kN *
+                MmaCore::WarpCount::kK ==
+            kNumWarpsPerBlock,
+        ""); // the warp grid must contain exactly kNumWarpsPerBlock warps
+
+    // used for efficient load of bias tile Bij from global to shared memory
+    using BiasLoader = TileSmemLoader<
+        scalar_t,
+        cutlass::MatrixShape<kQueriesPerBlock, kKeysPerBlock>,
+        MmaCore::kThreads,
+        // input restriction: kv_len has to be a multiple of this value
+        128 / cutlass::sizeof_bits<scalar_t>::value>;
+
+    // Epilogue to store to shared-memory in a format that we can use later for
+    // the second matmul
+    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
+        typename Mma::Operator::IteratorC,
+        typename Mma::Operator,
+        scalar_t,
+        WarpShape,
+        ThreadblockShape>;
+    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage;
+  };
+
+  struct MM1 {
+    /**
+      Compile-time configuration of the second matmul: perform `attn @ V`
+      where `attn` is the attention (not normalized) and stored in shared
+      memory.
+
+      - Operand A comes from shared memory, so kAlignmentA is always
+        DefaultConfig::kAlignmentA; only kAlignmentB depends on kIsAligned.
+      - The stage count follows the same rule as in MM0 (4 when
+        ArchTag::kMinComputeCapability >= 80 and kIsHalf, otherwise
+        DefaultConfig::kStages).
+      - DefaultMmaFromSmem is given MM0::AccumulatorSharedStorage::Shape::kN
+        as its kMaxK and `false` for kScaleOperandA.
+      - OutputTileIterator (element output_t) and OutputTileIteratorAccum
+        (element output_accum_t) both reuse the default epilogue's ThreadMap.
+    */
+    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
+
+    using OpClass = typename GemmType::OpClass;
+    using DefaultConfig =
+        typename cutlass::gemm::device::DefaultGemmConfiguration<
+            OpClass,
+            ArchTag,
+            scalar_t,
+            scalar_t,
+            output_accum_t, // ElementC
+            accum_t // ElementAccumulator
+            >;
+    static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem
+    static constexpr int kAlignmentB =
+        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment;
+    using ThreadblockShape = cutlass::gemm::
+        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>;
+    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
+    using InstructionShape = typename GemmType::InstructionShape;
+
+    using LayoutB = cutlass::layout::RowMajor;
+    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
+        scalar_t, // ElementA,
+        cutlass::layout::RowMajor, // LayoutA,
+        kAlignmentA,
+        scalar_t, // ElementB,
+        LayoutB, // LayoutB,
+        kAlignmentB,
+        output_accum_t,
+        cutlass::layout::RowMajor, // LayoutC,
+        accum_t,
+        OpClass,
+        ArchTag,
+        ThreadblockShape,
+        WarpShape,
+        typename GemmType::InstructionShape,
+        typename DefaultConfig::EpilogueOutputOp,
+        void, // ThreadblockSwizzle - not used
+        ArchTag::kMinComputeCapability >= 80 && kIsHalf
+            ? 4
+            : DefaultConfig::kStages,
+        false, // SplitKSerial
+        typename GemmType::Operator>;
+
+    using WarpIteratorA = typename cutlass::gemm::threadblock::
+        DefaultWarpIteratorAFromSharedMemory<
+            typename DefaultGemm::Mma::Policy::Operator::Shape, // WarpShape
+            typename DefaultGemm::Mma::Policy::Operator::InstructionShape,
+            typename DefaultGemm::Mma::Policy::Operator::IteratorA,
+            typename DefaultGemm::Mma::Policy>::WarpIterator;
+    using DefaultMmaFromSmem =
+        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
+            typename DefaultGemm::Mma,
+            MM0::AccumulatorSharedStorage::Shape::kN, // kMaxK
+            WarpIteratorA,
+            false>; // kScaleOperandA
+    using Mma = typename DefaultMmaFromSmem::Mma;
+    using IteratorB = typename Mma::IteratorB;
+    using WarpCount = typename Mma::WarpCount;
+    static_assert(
+        WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock,
+        ""); // the warp grid must contain exactly kNumWarpsPerBlock warps
+
+    using DefaultEpilogue = typename DefaultGemm::Epilogue;
+    using OutputTileIterator =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_t>;
+    using OutputTileIteratorAccum =
+        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
+            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
+            output_accum_t>;
+  };
+
+  static constexpr int64_t kAlignmentQ = MM0::kAlignmentA; // query, attn_bias
+  static constexpr int64_t kAlignmentK = MM0::kAlignmentB; // key
+  static constexpr int64_t kAlignmentV = 1; // value
+
+  // Shared storage - depends on kernel params. One slot per query-tile row.
+  struct ScalingCoefs {
+    cutlass::Array<accum_t, kQueriesPerBlock> m_prime; // kernel seeds: -inf
+    cutlass::Array<accum_t, kQueriesPerBlock> s_prime; // kernel seeds: 0
+    cutlass::Array<accum_t, kQueriesPerBlock> mi; // kernel seeds: -inf
+    cutlass::Array<accum_t, kQueriesPerBlock> out_rescale; // kernel seeds: 1
+    cutlass::Array<accum_t, kQueriesPerBlock * MM0::MmaCore::WarpCount::kN>
+        addition_storage; // handed to iterative_softmax by the kernel
+  };
+
+  struct SharedStorageEpilogueAtEnd : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      union {
+        typename MM0::BiasLoader::SmemTile bias;
+        typename MM0::AccumulatorSharedStorage si;
+      };
+      typename MM1::Mma::SharedStorage mm1;
+    };
+    /*
+      Layout picked by the SharedStorage alias when
+      kSingleValueIteration || kKeepOutputInRF: the epilogue buffer shares
+      its bytes with both mm0 and after_mm0.
+    */
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+    // Accessor with the same name in both layouts, so callers stay generic.
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return epilogue;
+    }
+  };
+
+  struct SharedStorageEpilogueInLoop : ScalingCoefs {
+    struct SharedStorageAfterMM0 {
+      // Everything here might be overwritten during MM0
+      union {
+        typename MM0::BiasLoader::SmemTile bias;
+        typename MM0::AccumulatorSharedStorage si;
+      };
+      typename MM1::Mma::SharedStorage mm1;
+      typename MM1::DefaultEpilogue::SharedStorage epilogue;
+    };
+    /*
+      Layout picked by the SharedStorage alias when neither
+      kSingleValueIteration nor kKeepOutputInRF is set: the epilogue buffer
+      sits next to `si` and `mm1` inside after_mm0, and only overlaps mm0.
+    */
+    union {
+      typename MM0::Mma::SharedStorage mm0;
+      SharedStorageAfterMM0 after_mm0;
+    };
+    // Accessor with the same name in both layouts, so callers stay generic.
+    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
+    epilogue_shared_storage() {
+      return after_mm0.epilogue;
+    }
+  };
+
+  using SharedStorage = typename cutlass::platform::conditional<
+      kSingleValueIteration || kKeepOutputInRF,
+      SharedStorageEpilogueAtEnd, // epilogue buffer unioned with mm0/after_mm0
+      SharedStorageEpilogueInLoop>::type; // epilogue buffer inside after_mm0
+
+  static bool __host__ check_supported(Params const& p) { // host-side checks
+    CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ);
+    CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK);
+    CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV);
+    if (kSupportsBias) { // attn_bias is held to the query alignment
+      CHECK_ALIGNED_PTR(p.attn_bias_ptr, kAlignmentQ);
+      XFORMERS_CHECK( // batch stride is only checked with more than one batch
+          p.num_batches <= 1 || p.bias_strideB % kAlignmentQ == 0,
+          "attn_bias is not correctly aligned (strideB)");
+      XFORMERS_CHECK( // head stride is only checked with more than one head
+          p.num_heads <= 1 || p.bias_strideH % kAlignmentQ == 0,
+          "attn_bias is not correctly aligned (strideH)");
+      XFORMERS_CHECK(
+          p.bias_strideM % kAlignmentQ == 0,
+          "attn_bias is not correctly aligned");
+    }
+    XFORMERS_CHECK( // row strides of q/k/v are always checked
+        p.q_strideM % kAlignmentQ == 0,
+        "query is not correctly aligned (strideM)");
+    XFORMERS_CHECK(
+        p.k_strideM % kAlignmentK == 0,
+        "key is not correctly aligned (strideM)");
+    XFORMERS_CHECK(
+        p.v_strideM % kAlignmentV == 0,
+        "value is not correctly aligned (strideM)");
+    XFORMERS_CHECK( // head strides are only checked when num_heads > 1
+        p.num_heads <= 1 || p.q_strideH % kAlignmentQ == 0,
+        "query is not correctly aligned (strideH)");
+    XFORMERS_CHECK(
+        p.num_heads <= 1 || p.k_strideH % kAlignmentK == 0,
+        "key is not correctly aligned (strideH)");
+    XFORMERS_CHECK(
+        p.num_heads <= 1 || p.v_strideH % kAlignmentV == 0,
+        "value is not correctly aligned (strideH)");
+    XFORMERS_CHECK(
+        p.custom_mask_type < NumCustomMaskTypes,
+        "invalid value for `custom_mask_type`");
+    return true; // NOTE(review): failure handling lives in the CHECK macros
+  }
+
+  static void CUTLASS_DEVICE attention_kernel(Params& p) {
+    // Forward pass for one tile of kQueriesPerBlock queries: loops over the
+    // keys in blocks of kKeysPerBlock. In this block, we will only ever read
+    // query[query_start:query_end, :] and write output[query_start:query_end, :]
+
+    extern __shared__ char smem_buffer[];
+    SharedStorage& shared_storage = *((SharedStorage*)smem_buffer);
+    auto& m_prime = shared_storage.m_prime;
+    auto& s_prime = shared_storage.s_prime;
+    auto& mi = shared_storage.mi;
+    auto& out_rescale = shared_storage.out_rescale;
+    const uint32_t query_start = blockIdx.x * kQueriesPerBlock;
+    // Seed the per-row softmax state: one thread per row of the query tile.
+    static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, "");
+    if (thread_id() < kQueriesPerBlock) {
+      s_prime[thread_id()] = accum_t(0);
+      out_rescale[thread_id()] = accum_t(1.0);
+      m_prime[thread_id()] =
+          -cutlass::platform::numeric_limits<accum_t>::infinity();
+      mi[thread_id()] = -cutlass::platform::numeric_limits<accum_t>::infinity();
+    }
+    typename MM1::Mma::FragmentC accum_o;
+    accum_o.clear();
+    // Iterator factories over p.output_ptr / p.output_accum_ptr at column col.
+    auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator {
+      using OutputTileIterator = typename MM1::OutputTileIterator;
+      return OutputTileIterator(
+          typename OutputTileIterator::Params{(int32_t)p.o_strideM},
+          p.output_ptr,
+          typename OutputTileIterator::TensorCoord{
+              p.num_queries, p.head_dim_value},
+          thread_id(),
+          {0, col});
+    };
+
+    auto createOutputAccumIter = [&](int col) ->
+        typename MM1::OutputTileIteratorAccum {
+          using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum;
+          return OutputTileIteratorAccum(
+              typename OutputTileIteratorAccum::Params{
+                  (int32_t)(p.head_dim_value * p.num_heads)},
+              p.output_accum_ptr,
+              typename OutputTileIteratorAccum::TensorCoord{
+                  p.num_queries, p.head_dim_value},
+              thread_id(),
+              {0, col});
+        };
+
+#ifdef HAS_PYTORCH
+    curandStatePhilox4_32_10_t curand_state_init;
+    if (kSupportsDropout && p.use_dropout) {
+      const auto seeds = at::cuda::philox::unpack(p.rng_engine_inputs);
+
+      // each element of the attention matrix P with shape
+      // (batch_sz, n_heads, n_queries, n_keys) is associated with a single
+      // offset in RNG sequence. we initialize the RNG state with offset that
+      // starts at the beginning of a (n_queries, n_keys) matrix for this
+      // block's batch_id and head_id
+      // initializing rng state is very expensive, so we run once per kernel,
+      // rather than once per iteration. each iteration takes a copy of the
+      // initialized RNG state and offsets it as needed.
+      curand_init(
+          std::get<0>(seeds),
+          0,
+          std::get<1>(seeds) + p.dropout_batch_head_rng_offset,
+          &curand_state_init);
+    }
+#endif
+
+    // Iterate through keys, kKeysPerBlock at a time
+    for (int32_t iter_key_start = 0; iter_key_start < p.num_keys;
+         iter_key_start += kKeysPerBlock) {
+      int32_t problem_size_0_m =
+          cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries);
+      int32_t problem_size_0_n = cutlass::fast_min(
+          int32_t(kKeysPerBlock), p.num_keys - iter_key_start);
+      int32_t const& problem_size_0_k = p.head_dim;
+      int32_t const& problem_size_1_n = p.head_dim_value;
+      int32_t const& problem_size_1_k = problem_size_0_n;
+      // Runs MM1's prologue on the V tile of output column block `blockN`.
+      auto prologueV = [&](int blockN) {
+        typename MM1::Mma::IteratorB iterator_V(
+            typename MM1::IteratorB::Params{typename MM1::LayoutB(p.v_strideM)},
+            p.value_ptr + iter_key_start * p.v_strideM,
+            {problem_size_1_k, problem_size_1_n},
+            thread_id(),
+            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+        MM1::Mma::prologue(
+            shared_storage.after_mm0.mm1,
+            iterator_V,
+            thread_id(),
+            problem_size_1_k);
+      };
+
+      __syncthreads(); // Need to have shared memory initialized, and `m_prime`
+                       // updated from end of prev iter
+      //
+      // MATMUL: Q.K_t
+      //
+      // Computes the block-matrix product of:
+      // (a) query[query_start:query_end, :]
+      // with
+      // (b) key[iter_key_start:iter_key_start + kKeysPerBlock]
+      // and stores that into `shared_storage.si`
+      //
+
+      // Compute threadblock location
+      cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0};
+
+      cutlass::MatrixCoord tb_offset_A{
+          tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()};
+
+      cutlass::MatrixCoord tb_offset_B{
+          tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN};
+
+      // Construct iterators to A and B operands
+      typename MM0::IteratorA iterator_A(
+          typename MM0::IteratorA::Params(
+              typename MM0::MmaCore::LayoutA(p.q_strideM)),
+          p.query_ptr,
+          {problem_size_0_m, problem_size_0_k},
+          thread_id(),
+          tb_offset_A);
+
+      typename MM0::IteratorB iterator_B(
+          typename MM0::IteratorB::Params(
+              typename MM0::MmaCore::LayoutB(p.k_strideM)),
+          p.key_ptr + iter_key_start * p.k_strideM,
+          {problem_size_0_k, problem_size_0_n},
+          thread_id(),
+          tb_offset_B);
+
+      auto my_warp_id = warp_uniform(warp_id());
+      auto my_lane_id = lane_id();
+
+      // Construct thread-scoped matrix multiply
+      typename MM0::Mma mma(
+          shared_storage.mm0, thread_id(), my_warp_id, my_lane_id);
+
+      typename MM0::Mma::FragmentC accum;
+
+      accum.clear();
+
+      auto gemm_k_iterations =
+          (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK;
+
+      // Compute threadblock-scoped matrix multiply-add
+      mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+      __syncthreads();
+
+      if (kPreloadV) {
+        prologueV(0);
+      } else {
+        MM1::Mma::drain_cp_asyncs();
+      }
+
+      typename MM0::Mma::Operator::IteratorC::TensorCoord
+          iteratorC_tile_offset = {
+              (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) +
+                  (my_warp_id % MM0::Mma::WarpCount::kM),
+              (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) +
+                  (my_warp_id / MM0::Mma::WarpCount::kM)};
+
+      // scale now if bias is supported; else iterative_softmax gets p.scale
+      if (kSupportsBias) {
+        accum =
+            cutlass::multiplies<typename MM0::Mma::FragmentC>()(p.scale, accum);
+      }
+
+      // apply attention bias if applicable
+      if (kSupportsBias && p.attn_bias_ptr != nullptr) {
+        // load bias tile Bij into shared memory
+        typename MM0::BiasLoader::GmemTileIterator bias_iter(
+            {cutlass::layout::RowMajor(p.bias_strideM)},
+            // attn_bias_pointer points to matrix of size (n_queries, n_keys)
+            // for the relevant batch_id and head_id
+            p.attn_bias_ptr + query_start * p.bias_strideM + iter_key_start,
+            {problem_size_0_m, problem_size_0_n},
+            thread_id());
+        cutlass::TensorRef<scalar_t, cutlass::layout::RowMajor> bias_tensor_ref(
+            shared_storage.after_mm0.bias.data(),
+            cutlass::layout::RowMajor(MM0::ThreadblockShape::kN));
+        typename MM0::BiasLoader::SmemTileIterator smem_tile_iter(
+            bias_tensor_ref, thread_id());
+        MM0::BiasLoader::load(bias_iter, smem_tile_iter);
+
+        // Pij += Bij, Pij is in register fragment and Bij is in shared memory
+        auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset(
+            my_lane_id, my_warp_id, iteratorC_tile_offset);
+        MM0::AccumLambdaIterator::iterateRows(
+            lane_offset,
+            [&](int accum_m) {},
+            [&](int accum_m, int accum_n, int idx) {
+              if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) {
+                accum[idx] += bias_tensor_ref.at({accum_m, accum_n});
+              }
+            },
+            [&](int accum_m) {});
+      }
+
+      // Mask out last if causal.
+      // This is only needed if the upper-right corner of the current query/key
+      // block intersects the mask. That corner is at y = query_start,
+      // x = min(iter_key_start + kKeysPerBlock, num_keys). The first masked
+      // element is x = y + offset -> query_start + offset. There is an
+      // intersection (and we need to mask) if
+      // min(iter_key_start + kKeysPerBlock, num_keys) >= query_start + offset.
+      if (p.custom_mask_type &&
+          cutlass::fast_min(iter_key_start + kKeysPerBlock, p.num_keys) >=
+              (query_start + p.causal_diagonal_offset)) {
+        auto query_start = blockIdx.x * kQueriesPerBlock; // shadows outer one
+        auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset(
+            my_lane_id, my_warp_id, iteratorC_tile_offset);
+        int32_t last_col;
+        MM0::AccumLambdaIterator::iterateRows(
+            lane_offset,
+            [&](int accum_m) {
+              // last absolute col is (last absolute query + offset)
+              // last local col is (last absolute query + offset -
+              // iter_key_start)
+              last_col = query_start + accum_m + p.causal_diagonal_offset -
+                  iter_key_start;
+            },
+            [&](int accum_m, int accum_n, int idx) {
+              if (accum_n > last_col) {
+                accum[idx] =
+                    -cutlass::platform::numeric_limits<accum_t>::infinity();
+              }
+            },
+            [&](int accum_m) {});
+      }
+      // Update `mi` from accum stored in registers
+      // Also does accum[i] <- exp(accum[i] - mi)
+      iterative_softmax<typename MM0::Mma::Operator::IteratorC>(
+          accum_o,
+          accum,
+          mi,
+          m_prime,
+          s_prime,
+          out_rescale,
+          shared_storage.addition_storage,
+          my_lane_id,
+          thread_id(),
+          my_warp_id,
+          p.num_keys - iter_key_start,
+          iter_key_start == 0,
+          iteratorC_tile_offset,
+          kSupportsBias ? 1.0f : p.scale);
+
+      // Output results to shared-memory
+      int warp_idx_mn_0 = my_warp_id %
+          (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN);
+      auto output_tile_coords = cutlass::MatrixCoord{
+          warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM,
+          warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM};
+
+      MM0::B2bGemm::accumToSmem(
+          shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords);
+
+      __syncthreads();
+
+#ifdef HAS_PYTORCH
+      // apply dropout (if applicable) after we've written Pij to smem.
+      // dropout is applied by multiplying each element of Pij by:
+      // - 0 with probability dropout_p
+      // - 1 / (1 - dropout_p) with probability 1 - dropout_p
+      //
+      // for backward purposes we want to be able to map each element of the
+      // attention matrix to the same random uniform number as the one we used
+      // in forward, without needing to use the same iteration order or having
+      // to store the dropout matrix. it's possible to do this in registers but
+      // it ends up being very slow because each thread having noncontiguous
+      // strips of the Pij tile means we have to skip around a lot, and also
+      // have to generate a single random number at a time
+      if (kSupportsDropout && p.use_dropout) {
+        auto si = shared_storage.after_mm0.si.accum_ref();
+        // each thread handles a contiguous sequence of elements from Sij, all
+        // coming from the same row. the reason they have to come from the same
+        // row is that the sampling random numbers from a contiguous random
+        // number sequence is much more efficient than jumping around, and the
+        // linear offset of each element of S (the global matrix) maps to an
+        // offset in a random number sequence. for S, the end of a row and the
+        // beginning of the next have adjacent offsets, but for Sij, this is not
+        // necessarily the case.
+        const int num_threads = blockDim.x * blockDim.y * blockDim.z;
+        const int threads_per_row =
+            cutlass::fast_min(num_threads / problem_size_0_m, problem_size_0_n);
+        const int elts_per_thread = cutlass::round_nearest(
+            cutlass::ceil_div(problem_size_0_n, threads_per_row), 4);
+
+        const int thread_i = thread_id() / threads_per_row;
+        const int thread_start_j =
+            (thread_id() % threads_per_row) * elts_per_thread;
+
+        if (thread_i < problem_size_0_m && thread_start_j < problem_size_0_n) {
+          curandStatePhilox4_32_10_t curand_state = curand_state_init;
+          skipahead(
+              static_cast<unsigned long long>(
+                  (query_start + thread_i) * p.num_keys_absolute +
+                  (iter_key_start + thread_start_j)),
+              &curand_state);
+          const float dropout_scale = 1.0 / (1.0 - p.dropout_prob);
+
+          // apply dropout scaling to elements this thread is responsible for,
+          // in chunks of 4
+          for (int sij_start_col_idx = thread_start_j; sij_start_col_idx <
+               cutlass::fast_min(thread_start_j + elts_per_thread,
+                                 problem_size_0_n);
+               sij_start_col_idx += 4) {
+            const float4 rand_uniform_quad = curand_uniform4(&curand_state);
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int quad_idx = 0; quad_idx < 4; ++quad_idx) {
+              si.at({thread_i, sij_start_col_idx + quad_idx}) *=
+                  static_cast<scalar_t>(
+                      dropout_scale *
+                      ((&rand_uniform_quad.x)[quad_idx] > p.dropout_prob));
+            }
+          }
+        }
+        __syncthreads(); // p.use_dropout should have same value kernel-wide
+      }
+#endif
+
+      //
+      // MATMUL: Attn . V
+      // Run the matmul `attn @ V` for a block of attn and V.
+      // `attn` is read from shared memory (in `shared_storage_si`)
+      // `V` is read from global memory (with iterator_B)
+      //
+
+      const int64_t nBlockN = kSingleValueIteration
+          ? 1
+          : ceil_div(
+                (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN));
+      for (int blockN = 0; blockN < nBlockN; ++blockN) {
+        int gemm_k_iterations =
+            (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK;
+
+        // Compute threadblock-scoped matrix multiply-add and store it in accum
+        // (in registers)
+        if (!kPreloadV) {
+          __syncthreads(); // we share shmem between mma and epilogue
+        }
+
+        typename MM1::Mma::IteratorB iterator_V(
+            typename MM1::IteratorB::Params{typename MM1::LayoutB(p.v_strideM)},
+            p.value_ptr + iter_key_start * p.v_strideM,
+            {problem_size_1_k, problem_size_1_n},
+            thread_id(),
+            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
+        typename MM1::Mma mma_pv(
+            // operand A: Pij_dropped in shared memory
+            shared_storage.after_mm0.si.accum_ref(),
+            // operand B: shared memory staging area for Vj, which is loaded
+            // from global memory
+            shared_storage.after_mm0.mm1.operand_B_ref(),
+            (int)thread_id(),
+            (int)my_warp_id,
+            (int)my_lane_id);
+        mma_pv.set_prologue_done(kPreloadV);
+        if (!kKeepOutputInRF) {
+          accum_o.clear();
+        }
+        mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o);
+        __syncthreads();
+
+        if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) {
+          prologueV(blockN + 1);
+        }
+
+        if (!kKeepOutputInRF) {
+          MM1::Mma::drain_cp_asyncs();
+          DISPATCH_BOOL(
+              iter_key_start == 0, kIsFirst, ([&] {
+                DISPATCH_BOOL(
+                    (iter_key_start + kKeysPerBlock) >= p.num_keys,
+                    kIsLast,
+                    ([&] {
+                      using DefaultEpilogue = typename MM1::DefaultEpilogue;
+                      using DefaultOp =
+                          typename MM1::DefaultConfig::EpilogueOutputOp;
+                      using ElementCompute = typename DefaultOp::ElementCompute;
+                      using EpilogueOutputOp = typename cutlass::epilogue::
+                          thread::MemoryEfficientAttentionNormalize<
+                              typename cutlass::platform::conditional<
+                                  kIsLast::value,
+                                  output_t,
+                                  output_accum_t>::type,
+                              output_accum_t,
+                              DefaultOp::kCount,
+                              typename DefaultOp::ElementAccumulator,
+                              ElementCompute,
+                              kIsFirst::value,
+                              kIsLast::value,
+                              cutlass::Array<ElementCompute, kQueriesPerBlock>>;
+                      using Epilogue = typename cutlass::epilogue::threadblock::
+                          EpiloguePipelined<
+                              typename DefaultEpilogue::Shape,
+                              typename MM1::Mma::Operator,
+                              DefaultEpilogue::kPartitionsK,
+                              typename cutlass::platform::conditional<
+                                  kIsLast::value,
+                                  typename MM1::OutputTileIterator,
+                                  typename MM1::OutputTileIteratorAccum>::type,
+                              typename DefaultEpilogue::
+                                  AccumulatorFragmentIterator,
+                              typename DefaultEpilogue::WarpTileIterator,
+                              typename DefaultEpilogue::SharedLoadIterator,
+                              EpilogueOutputOp,
+                              typename DefaultEpilogue::Padding,
+                              DefaultEpilogue::kFragmentsPerIteration,
+                              true, // IterationsUnroll
+                              typename MM1::OutputTileIteratorAccum // Read
+                                                                    // iterator
+                              >;
+
+                      int col = blockN * MM1::Mma::Shape::kN;
+                      auto source_iter = createOutputAccumIter(col);
+                      auto dest_iter = call_conditional<
+                          kIsLast::value,
+                          decltype(createOutputIter),
+                          decltype(createOutputAccumIter)>::
+                          apply(createOutputIter, createOutputAccumIter, col);
+                      EpilogueOutputOp rescale(s_prime, out_rescale);
+                      Epilogue epilogue(
+                          shared_storage.epilogue_shared_storage(),
+                          thread_id(),
+                          my_warp_id,
+                          my_lane_id);
+                      epilogue(rescale, dest_iter, accum_o, source_iter);
+                    }));
+              }));
+          if (!kSingleValueIteration) {
+            __syncthreads();
+          }
+        }
+      }
+      __syncthreads(); // we modify `m_prime` after
+    }
+    // accum_o was never cleared in the key loop here: run the epilogue once.
+    if (kKeepOutputInRF) {
+      constexpr bool kIsFirst = true;
+      constexpr bool kIsLast = true;
+      using DefaultEpilogue = typename MM1::DefaultEpilogue;
+      using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp;
+      using ElementCompute = typename DefaultOp::ElementCompute;
+      using EpilogueOutputOp =
+          typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize<
+              output_t, // output
+              output_accum_t, // source
+              DefaultOp::kCount,
+              typename DefaultOp::ElementAccumulator, // accum
+              output_accum_t, // compute
+              kIsFirst,
+              kIsLast,
+              cutlass::Array<ElementCompute, kQueriesPerBlock>>;
+      using Epilogue =
+          typename cutlass::epilogue::threadblock::EpiloguePipelined<
+              typename DefaultEpilogue::Shape,
+              typename MM1::Mma::Operator,
+              DefaultEpilogue::kPartitionsK,
+              typename MM1::OutputTileIterator, // destination
+              typename DefaultEpilogue::AccumulatorFragmentIterator,
+              typename DefaultEpilogue::WarpTileIterator,
+              typename DefaultEpilogue::SharedLoadIterator,
+              EpilogueOutputOp,
+              typename DefaultEpilogue::Padding,
+              DefaultEpilogue::kFragmentsPerIteration,
+              true, // IterationsUnroll
+              typename MM1::OutputTileIteratorAccum // source tile
+              >;
+      auto dest_iter = createOutputIter(0);
+      EpilogueOutputOp rescale(s_prime, out_rescale);
+      Epilogue epilogue(
+          shared_storage.epilogue_shared_storage(),
+          thread_id(),
+          warp_id(),
+          lane_id());
+      MM1::Mma::drain_cp_asyncs();
+      epilogue(rescale, dest_iter, accum_o);
+    }
+
+    // Calculate logsumexp (one thread per query row of this tile).
+    // To make the backward easier, we pad logsumexp with `inf` past
+    // num_queries; this avoids a few bound checks, and is not more expensive
+    // during fwd.
+    static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, "");
+    if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) {
+      auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE;
+      constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E
+      if (thread_id() < p.num_queries) {
+        p.logsumexp_ptr[thread_id()] = accum_t(mi[thread_id()] / kLog2e) +
+            cutlass::fast_log(accum_t(s_prime[thread_id()]));
+      } else if (thread_id() < lse_dim) {
+        p.logsumexp_ptr[thread_id()] =
+            cutlass::platform::numeric_limits<accum_t>::infinity();
+      }
+    }
+  }
+
+  template <typename WarpIteratorC>
+  CUTLASS_DEVICE static void iterative_softmax(
+      typename WarpIteratorC::Fragment& frag_o, // output so far
+      typename WarpIteratorC::Fragment& frag, // scores of this tile; scaled and exponentiated in place
+      cutlass::Array<accum_t, kQueriesPerBlock>& mi, // running per-row max (raised with atomicMaxFloat)
+      cutlass::Array<accum_t, kQueriesPerBlock>& m_prime, // copy of `mi` saved at the end of this function
+      cutlass::Array<accum_t, kQueriesPerBlock>& s_prime, // running per-row sum of exponentials
+      cutlass::Array<accum_t, kQueriesPerBlock>& out_rescale, // per-row factor applied to `frag_o`
+      cutlass::Array<accum_t, kQueriesPerBlock * MM0::MmaCore::WarpCount::kN>&
+          addition_storage, // partial row sums, indexed by row + kQueriesPerBlock * tile column
+      int8_t lane_id,
+      int8_t thread_id, // not referenced in this function's body
+      int8_t warp_id,
+      int max_col, // columns >= max_col are ignored for the max and zeroed in `frag`
+      bool is_first, // when true, `frag_o` is not rescaled
+      typename WarpIteratorC::TensorCoord const& tile_offset,
+      float scaling) {
+    /* Iterates on the accumulator and corresponding position on result matrix
+
+    (1) Update `mi[r]` to the max value of the row `r`
+    (2) In a second iteration do the following:
+        (a) accum   <- exp(accum - mi)
+        (b) m_prime <- exp(m_prime - mi)
+        (c) s_prime <- s_prime * m_prime + sum(accum)
+
+    All of this is done on registers, before we store all of this
+    on shared memory for the next matmul with Value.
+    */
+    using Fragment = typename WarpIteratorC::Fragment;
+    using LambdaIterator = typename DefaultMmaAccumLambdaIterator<
+        WarpIteratorC,
+        accum_t,
+        kWarpSize>::Iterator;
+    // Convert to `accum_t` (rather than double)
+    constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E
+
+    static_assert(kQueriesPerBlock % kNumWarpsPerBlock == 0, "");
+    static constexpr int kLinesPerWarp = kQueriesPerBlock / kNumWarpsPerBlock;
+    // Pre-scale by log2(e) so that the exp2f() calls below evaluate e^(scaling * x)
+    frag = cutlass::multiplies<Fragment>()(scaling * kLog2e, frag);
+
+    auto lane_offset =
+        LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset);
+
+    // First update `mi` to the max per-row
+    {
+      accum_t max;
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) {
+            max = -cutlass::platform::numeric_limits<accum_t>::infinity();
+          },
+          [&](int accum_m, int accum_n, int idx) {
+            if (accum_n < max_col) {
+              max = cutlass::fast_max(max, frag[idx]);
+            }
+          },
+          [&](int accum_m) {
+            // Having 4x atomicMax seems faster than reduce within warp
+            // first...
+            atomicMaxFloat(&mi[accum_m], max);
+          });
+    }
+
+    // Make sure we all share the update values for `mi`
+    __syncthreads();
+
+    // Doing this `exp` is quite expensive. Let's
+    // split it across the warps
+    bool restore_mi_to_minus_inf = false;
+    if (lane_id < kLinesPerWarp) {
+      int id = warp_id * kLinesPerWarp + lane_id; // the row this thread owns for the scalar updates
+      auto m_prime_id = m_prime[id];
+      auto mi_id = mi[id];
+      bool changed = m_prime_id < mi_id; // `false` if both are -inf
+      if (changed) {
+        auto m_prime_exp = exp2f(m_prime_id - mi_id);
+        out_rescale[id] = m_prime_exp;
+        s_prime[id] *= m_prime_exp;
+      } else {
+        // Only when bias is enabled, it's possible that all the first values
+        // of attention are masked to `-inf`. In that case we want to avoid
+        // `nan = exp2f(-inf - (-inf))` so we temporarily set `mi` to 0
+        if (kSupportsBias &&
+            mi_id == -cutlass::platform::numeric_limits<accum_t>::infinity()) {
+          restore_mi_to_minus_inf = true;
+          mi[id] = 0.0f;
+        }
+        out_rescale[id] = 1.0f;
+      }
+    }
+    __syncthreads(); // Update output fragments
+    if (kKeepOutputInRF && !is_first) {
+      accum_t line_rescale;
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) { line_rescale = out_rescale[accum_m]; },
+          [&](int accum_m, int accum_n, int idx) {
+            frag_o[idx] = frag_o[idx] * line_rescale;
+          },
+          [&](int accum_m) {});
+    }
+    // Exponentiate the scores in place, then sum each row
+    {
+      accum_t mi_row, total_row;
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) { mi_row = mi[accum_m]; },
+          [&](int accum_m, int accum_n, int idx) {
+            frag[idx] =
+                (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
+          },
+          [&](int accum_m) {});
+      LambdaIterator::iterateRows(
+          lane_offset,
+          [&](int accum_m) { total_row = 0.0; },
+          [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; },
+          [&](int accum_m) {
+            if (LambdaIterator::reduceSameRow(
+                    lane_id, total_row, [](accum_t a, accum_t b) {
+                      return a + b;
+                    })) {
+              // NOTE: we could atomically add `total_row` to `s_prime`, but
+              // it's faster (and deterministic) to avoid atomics here
+              addition_storage
+                  [accum_m + kQueriesPerBlock * tile_offset.column()] =
+                      total_row;
+            }
+          });
+    }
+    __syncthreads(); // `addition_storage` is fully written before it is summed below
+    if (lane_id < kLinesPerWarp) {
+      int id = warp_id * kLinesPerWarp + lane_id;
+      accum_t total_row = s_prime[id];
+      if (restore_mi_to_minus_inf) {
+        // Restore `mi`, see above when we set `restore_mi_to_minus_inf=true`
+        mi[id] = -cutlass::platform::numeric_limits<accum_t>::infinity();
+      } else {
+        m_prime[id] = mi[id];
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
+        total_row += addition_storage[id + kQueriesPerBlock * i];
+      }
+      s_prime[id] = total_row;
+    }
+  }
+
+  static CUTLASS_DEVICE int8_t lane_id() { // x component of the thread index
+    return static_cast<int8_t>(threadIdx.x); // narrowing to int8_t made explicit
+  }
+  static CUTLASS_DEVICE int8_t warp_id() { // y component of the thread index
+    return static_cast<int8_t>(threadIdx.y); // narrowing to int8_t made explicit
+  }
+  static CUTLASS_DEVICE int16_t thread_id() { // linear index in the block, x varying fastest
+    return static_cast<int16_t>(threadIdx.y * blockDim.x + threadIdx.x);
+  }
+};
+
+template <typename AK>
+__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
+    attention_kernel_batched_impl(typename AK::Params p) {
+  // Only blocks for which advance_to_block() returns true run the kernel body.
+  if (p.advance_to_block()) {
+    AK::attention_kernel(p);
+  }
+}
+
+template <typename AK>
+__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
+    attention_kernel_batched(typename AK::Params params); // declaration only; no body is given here
diff --git a/third_party/fused_multi_head_attention/piped_subprocess.py b/third_party/fused_multi_head_attention/piped_subprocess.py
new file mode 100644
index 0000000000..536bdb4305
--- /dev/null
+++ b/third_party/fused_multi_head_attention/piped_subprocess.py
@@ -0,0 +1,144 @@
+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+from typing import List
+import torch
+import subprocess
+import sys
+import tempfile
+import os
+import numpy as np
+
+
+TORCH_DTYPE_NAME = {  # dtype tag that writeTensor puts in its `<tag>:<name>` token
+    torch.float32: "f32",
+    torch.float16: "f16",
+    torch.bfloat16: "b16"
+}
+NAME_TORCH_DTYPE = {v: k for k, v in TORCH_DTYPE_NAME.items()}  # inverse map, used by readTensor
+
+def _tensor_from_storage(tensor: torch.Tensor, dtype) -> torch.Tensor:
+    # PyTorch >= 2.0
+    if hasattr(tensor, 'untyped_storage'):
+        return torch.tensor([], dtype=dtype).set_(tensor.untyped_storage())
+    return torch.tensor([], dtype=dtype).set_(tensor.storage().untyped())
+
+class PipedSubprocess:
+    def __init__(self, binary: str) -> None:
+        self.binary = binary
+        self.tempdir_ctx = tempfile.TemporaryDirectory()
+
+    def __enter__(self) -> "PipedSubprocess":
+        self.subp = subprocess.Popen(self.binary, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr, text=True, bufsize=0)
+        self.tempdir = self.tempdir_ctx.__enter__()
+        self.file_counter = 0
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.tempdir_ctx.__exit__(exc_type, exc_val, exc_tb)
+
+    def temp_filename(self, suffix: str) -> str:
+        self.file_counter += 1
+        return os.path.join(self.tempdir, f"{self.file_counter}{suffix}")
+
+    def write(self, *args) -> None:
+        for a in args:
+            self.subp.stdin.write(str(a) + " ")
+
+    def writeTensor(self, tensor: torch.Tensor, name: str, stride_names: List[str]) -> None:
+        print(f"Py ->C++: {TORCH_DTYPE_NAME[tensor.dtype]}:{name}")
+        tensor_u8 = _tensor_from_storage(tensor, torch.uint8)
+        self.write("tensor_begin", f"{TORCH_DTYPE_NAME[tensor.dtype]}:{name}", tensor_u8.shape[0])
+        filename = self.temp_filename(f"{name}.tensor")
+        assert tensor.storage_offset() == 0
+        with open(filename, "wb+") as fd:
+            fd.write(bytes(tensor_u8.numpy()))
+        self.write("file", filename)
+        self.write("tensor_end")
+
+        for stride_name, stride_value in zip(stride_names, tensor.stride()):
+            self.write(stride_name, stride_value)
+
+    def readTensor(self, name, stride_name, shape) -> torch.Tensor:
+        tmpfile = self.temp_filename(f"{name}.tensor")
+        self.write("tmpfile", tmpfile)
+
+        self.readExpect("tensor_begin")
+        dtype_str, name = self.read().split(":")
+        print(f"C++->Py : {dtype_str}:{name}")
+        u8len = int(self.read())
+        dtype = NAME_TORCH_DTYPE[dtype_str]
+
+        self.readExpect("file")
+        self.readExpect(tmpfile)
+
+        with open(tmpfile, "rb") as fd:
+            data = fd.read(u8len)
+            # `np.array` is not strictly needed, but avoids a torch warning
+            tensor_u8 = torch.frombuffer(np.array(data), dtype=torch.uint8, count=u8len)
+        self.readExpect("tensor_end")
+        
+        tensor = _tensor_from_storage(tensor_u8, dtype)
+        strides = []
+        for sn in stride_name:
+            self.readExpect(sn)
+            strides.append(int(self.read()))
+        if len(strides) != shape:
+            strides.append(1)
+        assert len(strides) == len(shape), name
+        return torch.as_strided(tensor, shape, strides)
+
+    def readNamed(self, name: str):
+        self.readExpect(name)
+        return self.read()
+
+    def readExpect(self, what: str) -> None:
+        r = self.read()
+        if r != what:
+            raise ValueError(f"Read {r} but expected {what}")
+
+    def read(self):
+        read_all = []
+        # Skip initial whitespace
+        while True:
+            r = self.subp.stdout.read(1)
+            if r not in [' ', "\n"]:
+                read_all.append(r)
+                break
+        # Read data
+        while True:
+            r = self.subp.stdout.read(1)
+            if r in [' ', "\n"]:
+                break
+            read_all.append(r)
+        return ''.join(read_all)
+        
diff --git a/third_party/fused_multi_head_attention/transform/tile_smem_loader.h b/third_party/fused_multi_head_attention/transform/tile_smem_loader.h
new file mode 100644
index 0000000000..048c1e019b
--- /dev/null
+++ b/third_party/fused_multi_head_attention/transform/tile_smem_loader.h
@@ -0,0 +1,90 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#pragma once
+
+#include <cutlass/cutlass.h>
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/regular_tile_iterator.h"
+
+template <
+    typename scalar_t, // scalar type
+    typename ThreadblockTileShape, // size of tile to load
+    int Threads, // number of participating threads
+    int ElementsPerAccess> // thread access width in elements
+class TileSmemLoader { // type aliases plus one static load(); declares no data members
+ public:
+  using SmemTile =
+      cutlass::AlignedBuffer<scalar_t, ThreadblockTileShape::kCount>;
+  // Thread map shared by the global-memory and shared-memory iterators below
+  using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap<
+      cutlass::layout::PitchLinearShape<
+          ThreadblockTileShape::kColumn, // contiguous
+          ThreadblockTileShape::kRow>, // strided
+      Threads, // Threads
+      ElementsPerAccess>; // ElementsPerAccess
+
+  using GmemTileIterator = // source iterator type of load()
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          ThreadblockTileShape, // Shape
+          scalar_t, // Element
+          cutlass::layout::RowMajor, // Layout
+          0, // AdvanceRank
+          ThreadMap>; // ThreadMap
+
+  using SmemTileIterator = cutlass::transform::threadblock::RegularTileIterator<
+      ThreadblockTileShape, // Shape
+      scalar_t, // Element
+      cutlass::layout::RowMajor, // Layout
+      0, // AdvanceRank
+      ThreadMap>; // ThreadMap
+
+  using Fragment = typename GmemTileIterator::Fragment; // staging type used inside load()
+
+  /// Load a tile from global memory into shared memory, then synchronize the block
+  CUTLASS_DEVICE
+  static void load(
+      GmemTileIterator tile_load_iter, // taken by value: the caller's iterator is not modified
+      SmemTileIterator tile_store_iter) { // taken by value as well
+    Fragment tb_frag;
+    tb_frag.clear(); // presumably so elements the predicated load skips stay zero - TODO confirm
+    tile_load_iter.load(tb_frag);
+    tile_store_iter.store(tb_frag);
+    // Barrier: no thread leaves load() before every thread has issued its store
+    __syncthreads();
+  }
+};
diff --git a/third_party/cl2.hpp b/third_party/opencl.hpp
similarity index 60%
rename from third_party/cl2.hpp
rename to third_party/opencl.hpp
index 711b429e9b..c71c009378 100644
--- a/third_party/cl2.hpp
+++ b/third_party/opencl.hpp
@@ -1,35 +1,23 @@
-/*******************************************************************************
- * Copyright (c) 2008-2016 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
- * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
- * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
- *    https://www.khronos.org/registry/
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- ******************************************************************************/
+//
+// Copyright (c) 2008-2024 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 /*! \file
  *
- *   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33),
- *       OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29)
+ *   \brief C++ bindings for OpenCL 1.0, OpenCL 1.1, OpenCL 1.2,
+ *       OpenCL 2.0, OpenCL 2.1, OpenCL 2.2, and OpenCL 3.0.
  *   \author Lee Howes and Bruce Merry
  *
  *   Derived from the OpenCL 1.x C++ bindings written by
@@ -40,20 +28,17 @@
  *       Bruce Merry, February 2013.
  *       Tom Deakin and Simon McIntosh-Smith, July 2013
  *       James Price, 2015-
- *
- *   \version 2.0.10
- *   \date 2016-07-20
+ *   \version 2.2.0
+ *   \date 2019-09-18
  *
  *   Optional extension support
  *
- *         cl_ext_device_fission
- *         #define CL_HPP_USE_CL_DEVICE_FISSION
  *         cl_khr_d3d10_sharing
  *         #define CL_HPP_USE_DX_INTEROP
+ *         cl_khr_il_program
+ *         #define CL_HPP_USE_IL_KHR
  *         cl_khr_sub_groups
  *         #define CL_HPP_USE_CL_SUB_GROUPS_KHR
- *         cl_khr_image2d_from_buffer
- *         #define CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR
  *
  *   Doxygen documentation for this header is available here:
  *
@@ -73,10 +58,10 @@
  * For many large applications C++ is the language of choice and so it seems
  * reasonable to define C++ bindings for OpenCL.
  *
+ * The interface is contained within a single C++ header file \em opencl.hpp and all
+ * The interface is contained with a single C++ header file \em opencl.hpp and all
  * definitions are contained within the namespace \em cl. There is no additional
  * requirement to include \em cl.h and to use either the C++ or original C
- * bindings; it is enough to simply include \em cl2.hpp.
+ * bindings; it is enough to simply include \em opencl.hpp.
  *
  * The bindings themselves are lightweight and correspond closely to the
  * underlying C API. Using the C++ bindings introduces no additional execution
@@ -85,7 +70,7 @@
  * There are numerous compatibility, portability and memory management
  * fixes in the new header as well as additional OpenCL 2.0 features.
  * As a result the header is not directly backward compatible and for this
- * reason we release it as cl2.hpp rather than a new version of cl.hpp.
+ * reason we release it as opencl.hpp rather than a new version of cl.hpp.
  * 
  *
  * \section compatibility Compatibility
@@ -95,9 +80,9 @@
  *
  * The combination of preprocessor macros CL_HPP_TARGET_OPENCL_VERSION and 
  * CL_HPP_MINIMUM_OPENCL_VERSION control this range. These are three digit
- * decimal values representing OpenCL runime versions. The default for 
- * the target is 200, representing OpenCL 2.0 and the minimum is also 
- * defined as 200. These settings would use 2.0 API calls only.
+ * decimal values representing OpenCL runtime versions. The default for 
+ * the target is 300, representing OpenCL 3.0.  The minimum is defined as 200.
+ * These settings would use 2.0 and newer API calls only.
  * If backward compatibility with a 1.2 runtime is required, the minimum
  * version may be set to 120.
  *
@@ -152,35 +137,36 @@
  * - CL_HPP_TARGET_OPENCL_VERSION
  *
  *   Defines the target OpenCL runtime version to build the header
+ *   against. Defaults to 300, representing OpenCL 3.0.
+ *
+ * - CL_HPP_MINIMUM_OPENCL_VERSION
+ *
+ *   Defines the minimum OpenCL runtime version to build the header
  *   against. Defaults to 200, representing OpenCL 2.0.
  *
  * - CL_HPP_NO_STD_STRING
  *
  *   Do not use the standard library string class. cl::string is not
- *   defined and may be defined by the user before cl2.hpp is
+ *   defined and may be defined by the user before opencl.hpp is
  *   included.
  *
  * - CL_HPP_NO_STD_VECTOR
  *
  *   Do not use the standard library vector class. cl::vector is not
- *   defined and may be defined by the user before cl2.hpp is
+ *   defined and may be defined by the user before opencl.hpp is
  *   included.
  *
  * - CL_HPP_NO_STD_ARRAY
  *
  *   Do not use the standard library array class. cl::array is not
- *   defined and may be defined by the user before cl2.hpp is
+ *   defined and may be defined by the user before opencl.hpp is
  *   included.
  *
  * - CL_HPP_NO_STD_UNIQUE_PTR
  *
  *   Do not use the standard library unique_ptr class. cl::pointer and
  *   the cl::allocate_pointer functions are not defined and may be
- *   defined by the user before cl2.hpp is included.
- *
- * - CL_HPP_ENABLE_DEVICE_FISSION
- *
- *   Enables device fission for OpenCL 1.2 platforms.
+ *   defined by the user before opencl.hpp is included.
  *
  * - CL_HPP_ENABLE_EXCEPTIONS
  *
@@ -208,18 +194,35 @@
  *   build variants.
  *
  *
+ * - CL_HPP_USE_CL_SUB_GROUPS_KHR
+ *
+ *   Enable the cl_khr_subgroups extension.
+ *
+ * - CL_HPP_USE_DX_INTEROP
+ *
+ *   Enable the cl_khr_d3d10_sharing extension.
+ *
+ * - CL_HPP_USE_IL_KHR
+ *
+ *   Enable the cl_khr_il_program extension.
+ *
+ *
  * \section example Example
  *
  * The following example shows a general use case for the C++
  * bindings, including support for the optional exception feature and
  * also the supplied vector and string classes, see following sections for
  * decriptions of these features.
+ * 
+ * Note: the C++ bindings use std::call_once and therefore may need to be
+ * compiled using special command-line options (such as "-pthread") on some
+ * platforms!
  *
  * \code
     #define CL_HPP_ENABLE_EXCEPTIONS
     #define CL_HPP_TARGET_OPENCL_VERSION 200
 
-    #include <CL/cl2.hpp>
+    #include <CL/opencl.hpp>
     #include <iostream>
     #include <vector>
     #include <memory>
@@ -229,28 +232,30 @@
 
     int main(void)
     {
-        // Filter for a 2.0 platform and set it as the default
+        // Filter for a 2.0 or newer platform and set it as the default
         std::vector<cl::Platform> platforms;
         cl::Platform::get(&platforms);
         cl::Platform plat;
         for (auto &p : platforms) {
             std::string platver = p.getInfo<CL_PLATFORM_VERSION>();
-            if (platver.find("OpenCL 2.") != std::string::npos) {
+            if (platver.find("OpenCL 2.") != std::string::npos ||
+                platver.find("OpenCL 3.") != std::string::npos) {
+                // Note: an OpenCL 3.x platform may not support all required features!
                 plat = p;
             }
         }
-        if (plat() == 0)  {
-            std::cout << "No OpenCL 2.0 platform found.";
+        if (plat() == 0) {
+            std::cout << "No OpenCL 2.0 or newer platform found.\n";
             return -1;
         }
 
         cl::Platform newP = cl::Platform::setDefault(plat);
         if (newP != plat) {
-            std::cout << "Error setting default platform.";
+            std::cout << "Error setting default platform.\n";
             return -1;
         }
 
-        // Use C++11 raw string literals for kernel source code
+        // C++11 raw string literal for the first kernel
         std::string kernel1{R"CLC(
             global int globalA;
             kernel void updateGlobal()
@@ -258,6 +263,8 @@
               globalA = 75;
             }
         )CLC"};
+
+        // Raw string literal for the second kernel
         std::string kernel2{R"CLC(
             typedef struct { global int *bar; } Foo;
             kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB,
@@ -284,8 +291,9 @@
             }
         )CLC"};
 
-        // New simpler string interface style
-        std::vector<std::string> programStrings {kernel1, kernel2};
+        std::vector<std::string> programStrings;
+        programStrings.push_back(kernel1);
+        programStrings.push_back(kernel2);
 
         cl::Program vectorAddProgram(programStrings);
         try {
@@ -324,12 +332,11 @@
         std::vector<int, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>> inputA(numElements, 1, svmAlloc);
         cl::coarse_svm_vector<int> inputB(numElements, 2, svmAlloc);
 
-        //
         //////////////
-
         // Traditional cl_mem allocations
+
         std::vector<int> output(numElements, 0xdeadbeef);
-        cl::Buffer outputBuffer(begin(output), end(output), false);
+        cl::Buffer outputBuffer(output.begin(), output.end(), false);
         cl::Pipe aPipe(sizeof(cl_int), numElements / 2);
 
         // Default command queue, also passed in as a parameter
@@ -351,14 +358,8 @@
         // This one was not passed as a parameter
         vectorAddKernel.setSVMPointers(anSVMInt);
 
-        // Hand control of coarse allocations to runtime
-        cl::enqueueUnmapSVM(anSVMInt);
-        cl::enqueueUnmapSVM(fooPointer);
-        cl::unmapSVM(inputB);
-        cl::unmapSVM(output2);
-
-	    cl_int error;
-	    vectorAddKernel(
+        cl_int error;
+        vectorAddKernel(
             cl::EnqueueArgs(
                 cl::NDRange(numElements/2),
                 cl::NDRange(numElements/2)),
@@ -369,12 +370,10 @@
             3,
             aPipe,
             defaultDeviceQueue,
-		    error
+            error
             );
 
-        cl::copy(outputBuffer, begin(output), end(output));
-        // Grab the SVM output vector using a map
-        cl::mapSVM(output2);
+        cl::copy(outputBuffer, output.begin(), output.end());
 
         cl::Device d = cl::Device::getDefault();
 
@@ -398,61 +397,80 @@
  * both and hence work with either version of the bindings.
  */
 #if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP)
-# pragma message("cl2.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead")
+# pragma message("opencl.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead")
 # define CL_HPP_USE_DX_INTEROP
 #endif
-#if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION)
-# pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead")
-# define CL_HPP_USE_CL_DEVICE_FISSION
-#endif
 #if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS)
-# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead")
+# pragma message("opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead")
 # define CL_HPP_ENABLE_EXCEPTIONS
 #endif
 #if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR)
-# pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead")
+# pragma message("opencl.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead")
 # define CL_HPP_NO_STD_VECTOR
 #endif
 #if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING)
-# pragma message("cl2.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead")
+# pragma message("opencl.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead")
 # define CL_HPP_NO_STD_STRING
 #endif
 #if defined(VECTOR_CLASS)
-# pragma message("cl2.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead")
+# pragma message("opencl.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead")
 #endif
 #if defined(STRING_CLASS)
-# pragma message("cl2.hpp: STRING_CLASS is deprecated. Alias cl::string instead.")
+# pragma message("opencl.hpp: STRING_CLASS is deprecated. Alias cl::string instead.")
 #endif
 #if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
-# pragma message("cl2.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead")
+# pragma message("opencl.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead")
 # define CL_HPP_USER_OVERRIDE_ERROR_STRINGS
 #endif
 
 /* Warn about features that are no longer supported
  */
 #if defined(__USE_DEV_VECTOR)
-# pragma message("cl2.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors")
+# pragma message("opencl.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors")
 #endif
 #if defined(__USE_DEV_STRING)
-# pragma message("cl2.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors")
+# pragma message("opencl.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors")
 #endif
 
 /* Detect which version to target */
 #if !defined(CL_HPP_TARGET_OPENCL_VERSION)
-# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 200 (OpenCL 2.0)")
-# define CL_HPP_TARGET_OPENCL_VERSION 200
+# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 300 (OpenCL 3.0)")
+# define CL_HPP_TARGET_OPENCL_VERSION 300
 #endif
-#if CL_HPP_TARGET_OPENCL_VERSION != 100 && CL_HPP_TARGET_OPENCL_VERSION != 110 && CL_HPP_TARGET_OPENCL_VERSION != 120 && CL_HPP_TARGET_OPENCL_VERSION != 200
-# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 200")
+#if CL_HPP_TARGET_OPENCL_VERSION != 100 && \
+    CL_HPP_TARGET_OPENCL_VERSION != 110 && \
+    CL_HPP_TARGET_OPENCL_VERSION != 120 && \
+    CL_HPP_TARGET_OPENCL_VERSION != 200 && \
+    CL_HPP_TARGET_OPENCL_VERSION != 210 && \
+    CL_HPP_TARGET_OPENCL_VERSION != 220 && \
+    CL_HPP_TARGET_OPENCL_VERSION != 300
+# pragma message("opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 300 (OpenCL 3.0).")
 # undef CL_HPP_TARGET_OPENCL_VERSION
-# define CL_HPP_TARGET_OPENCL_VERSION 200
+# define CL_HPP_TARGET_OPENCL_VERSION 300
+#endif
+
+/* Forward the target OpenCL version to the C headers unless one is already defined */
+#if defined(CL_TARGET_OPENCL_VERSION)
+/* Warn if the prior definition of CL_TARGET_OPENCL_VERSION is lower than
+ * the requested C++ bindings version */
+#if CL_TARGET_OPENCL_VERSION < CL_HPP_TARGET_OPENCL_VERSION
+# pragma message("opencl.hpp: CL_TARGET_OPENCL_VERSION is already defined and is lower than CL_HPP_TARGET_OPENCL_VERSION")
+#endif // CL_TARGET_OPENCL_VERSION < CL_HPP_TARGET_OPENCL_VERSION
+#else
+# define CL_TARGET_OPENCL_VERSION CL_HPP_TARGET_OPENCL_VERSION
+#endif // defined(CL_TARGET_OPENCL_VERSION)
 
 #if !defined(CL_HPP_MINIMUM_OPENCL_VERSION)
 # define CL_HPP_MINIMUM_OPENCL_VERSION 200
 #endif
-#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && CL_HPP_MINIMUM_OPENCL_VERSION != 110 && CL_HPP_MINIMUM_OPENCL_VERSION != 120 && CL_HPP_MINIMUM_OPENCL_VERSION != 200
-# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 100")
+#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && \
+    CL_HPP_MINIMUM_OPENCL_VERSION != 110 && \
+    CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \
+    CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \
+    CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \
+    CL_HPP_MINIMUM_OPENCL_VERSION != 220 && \
+    CL_HPP_MINIMUM_OPENCL_VERSION != 300
+# pragma message("opencl.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 100")
 # undef CL_HPP_MINIMUM_OPENCL_VERSION
 # define CL_HPP_MINIMUM_OPENCL_VERSION 100
 #endif
@@ -472,6 +490,12 @@
 #if CL_HPP_MINIMUM_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
 # define CL_USE_DEPRECATED_OPENCL_2_0_APIS
 #endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
+# define CL_USE_DEPRECATED_OPENCL_2_1_APIS
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
+# define CL_USE_DEPRECATED_OPENCL_2_2_APIS
+#endif
 
 #ifdef _WIN32
 
@@ -495,44 +519,44 @@
 #error Visual studio 2013 or another C++11-supporting compiler required
 #endif
 
-// 
-#if defined(CL_HPP_USE_CL_DEVICE_FISSION) || defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
-#include <CL/cl_ext.h>
-#endif
-
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <OpenCL/opencl.h>
 #else
 #include <CL/opencl.h>
 #endif // !__APPLE__
 
-#if (__cplusplus >= 201103L)
-#define CL_HPP_NOEXCEPT_ noexcept
-#else
-#define CL_HPP_NOEXCEPT_
-#endif
-
-#if defined(_MSC_VER)
+#if __cplusplus >= 201703L
+# define CL_HPP_DEFINE_STATIC_MEMBER_ inline
+#elif defined(_MSC_VER)
 # define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany)
+#elif defined(__MINGW32__)
+# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany))
 #else
 # define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak))
 #endif // !_MSC_VER
 
 // Define deprecated prefixes and suffixes to ensure compilation
 // in case they are not pre-defined
-#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED  
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-
-#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
-#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED  
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
-#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED)
-#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
+#if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_API_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_API_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED)
+#define CL_API_PREFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED)
+#if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED)
+#define CL_API_SUFFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED)
+
+#if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED)
+#define CL_API_PREFIX__VERSION_2_2_DEPRECATED
+#endif // #if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED)
+#if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED)
+#define CL_API_SUFFIX__VERSION_2_2_DEPRECATED
+#endif // #if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED)
 
 #if !defined(CL_CALLBACK)
 #define CL_CALLBACK
@@ -675,24 +699,27 @@ namespace cl {
  *
  */
 namespace cl {
-    class Memory;
 
-#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name) \
-    if (!pfn_##name) {    \
-    pfn_##name = (PFN_##name) \
-    clGetExtensionFunctionAddress(#name); \
-    if (!pfn_##name) {    \
-    } \
+#define CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(name) \
+    using PFN_##name = name##_fn
+// Assigns pfn_<name> from clGetExtensionFunctionAddress("<name>"), but only while pfn_<name> is still null.
+#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name)                               \
+    if (!pfn_##name) {                                                  \
+        pfn_##name = (PFN_##name)clGetExtensionFunctionAddress(#name);  \
     }
 
-#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name) \
-    if (!pfn_##name) {    \
-    pfn_##name = (PFN_##name) \
-    clGetExtensionFunctionAddressForPlatform(platform, #name); \
-    if (!pfn_##name) {    \
-    } \
+#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name)            \
+    if (!pfn_##name) {                                                  \
+        pfn_##name = (PFN_##name)                                       \
+            clGetExtensionFunctionAddressForPlatform(platform, #name);  \
     }
 
+#ifdef cl_khr_external_memory
+    enum class ExternalMemoryType : cl_external_memory_handle_type_khr;
+#endif
+
+    class Memory;
+    class Platform;
     class Program;
     class Device;
     class Context;
@@ -701,6 +728,13 @@ namespace cl {
     class Memory;
     class Buffer;
     class Pipe;
+#ifdef cl_khr_semaphore
+    class Semaphore;
+#endif
+#if defined(cl_khr_command_buffer)
+    class CommandBufferKhr;
+    class MutableCommandKhr;
+#endif // cl_khr_command_buffer
 
 #if defined(CL_HPP_ENABLE_EXCEPTIONS)
     /*! \brief Exception class 
@@ -722,18 +756,16 @@ namespace cl {
          *                handling of the exception has concluded.  If set, it
          *                will be returned by what().
          */
-        Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+        Error(cl_int err, const char * errStr = nullptr) : err_(err), errStr_(errStr)
         {}
 
-        ~Error() throw() {}
-
         /*! \brief Get error string associated with exception
          *
          * \return A memory pointer to the error message string.
          */
-        virtual const char * what() const throw ()
+        const char * what() const noexcept override
         {
-            if (errStr_ == NULL) {
+            if (errStr_ == nullptr) {
                 return "empty";
             }
             else {
@@ -749,7 +781,7 @@ namespace cl {
     };
 #define CL_HPP_ERR_STR_(x) #x
 #else
-#define CL_HPP_ERR_STR_(x) NULL
+#define CL_HPP_ERR_STR_(x) nullptr
 #endif // CL_HPP_ENABLE_EXCEPTIONS
 
 
@@ -758,7 +790,7 @@ namespace detail
 #if defined(CL_HPP_ENABLE_EXCEPTIONS)
 static inline cl_int errHandler (
     cl_int err,
-    const char * errStr = NULL)
+    const char * errStr = nullptr)
 {
     if (err != CL_SUCCESS) {
         throw Error(err, errStr);
@@ -766,7 +798,7 @@ static inline cl_int errHandler (
     return err;
 }
 #else
-static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+static inline cl_int errHandler (cl_int err, const char * errStr = nullptr)
 {
     (void) errStr; // suppress unused variable warning
     return err;
@@ -792,6 +824,11 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
 #define __GET_KERNEL_ARG_INFO_ERR           CL_HPP_ERR_STR_(clGetKernelArgInfo)
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __GET_KERNEL_SUB_GROUP_INFO_ERR     CL_HPP_ERR_STR_(clGetKernelSubGroupInfo)
+#else
+#define __GET_KERNEL_SUB_GROUP_INFO_ERR     CL_HPP_ERR_STR_(clGetKernelSubGroupInfoKHR)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
 #define __GET_KERNEL_WORK_GROUP_INFO_ERR    CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo)
 #define __GET_PROGRAM_INFO_ERR              CL_HPP_ERR_STR_(clGetProgramInfo)
 #define __GET_PROGRAM_BUILD_INFO_ERR        CL_HPP_ERR_STR_(clGetProgramBuildInfo)
@@ -800,6 +837,9 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __CREATE_CONTEXT_ERR                CL_HPP_ERR_STR_(clCreateContext)
 #define __CREATE_CONTEXT_FROM_TYPE_ERR      CL_HPP_ERR_STR_(clCreateContextFromType)
 #define __GET_SUPPORTED_IMAGE_FORMATS_ERR   CL_HPP_ERR_STR_(clGetSupportedImageFormats)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 300
+#define __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR  CL_HPP_ERR_STR_(clSetContextDestructorCallback)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300
 
 #define __CREATE_BUFFER_ERR                 CL_HPP_ERR_STR_(clCreateBuffer)
 #define __COPY_ERR                          CL_HPP_ERR_STR_(cl::copy)
@@ -823,6 +863,11 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __SET_KERNEL_ARGS_ERR               CL_HPP_ERR_STR_(clSetKernelArg)
 #define __CREATE_PROGRAM_WITH_SOURCE_ERR    CL_HPP_ERR_STR_(clCreateProgramWithSource)
 #define __CREATE_PROGRAM_WITH_BINARY_ERR    CL_HPP_ERR_STR_(clCreateProgramWithBinary)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __CREATE_PROGRAM_WITH_IL_ERR        CL_HPP_ERR_STR_(clCreateProgramWithIL)
+#else
+#define __CREATE_PROGRAM_WITH_IL_ERR        CL_HPP_ERR_STR_(clCreateProgramWithILKHR)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
 #define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels)
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
@@ -852,6 +897,10 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  CL_HPP_ERR_STR_(clEnqueueCopyImageToBuffer)
 #define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  CL_HPP_ERR_STR_(clEnqueueCopyBufferToImage)
 #define __ENQUEUE_MAP_BUFFER_ERR            CL_HPP_ERR_STR_(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_SVM_ERR               CL_HPP_ERR_STR_(clEnqueueSVMMap)
+#define __ENQUEUE_FILL_SVM_ERR              CL_HPP_ERR_STR_(clEnqueueSVMMemFill)
+#define __ENQUEUE_COPY_SVM_ERR              CL_HPP_ERR_STR_(clEnqueueSVMMemcpy)
+#define __ENQUEUE_UNMAP_SVM_ERR             CL_HPP_ERR_STR_(clEnqueueSVMUnmap)
 #define __ENQUEUE_MAP_IMAGE_ERR             CL_HPP_ERR_STR_(clEnqueueMapImage)
 #define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      CL_HPP_ERR_STR_(clEnqueueUnMapMemObject)
 #define __ENQUEUE_NDRANGE_KERNEL_ERR        CL_HPP_ERR_STR_(clEnqueueNDRangeKernel)
@@ -859,6 +908,11 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
 #define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   CL_HPP_ERR_STR_(clEnqueueMigrateMemObjects)
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __ENQUEUE_MIGRATE_SVM_ERR   CL_HPP_ERR_STR_(clEnqueueSVMMigrateMem)
+#define __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR   CL_HPP_ERR_STR_(clSetDefaultDeviceCommandQueue)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
+
 
 #define __ENQUEUE_ACQUIRE_GL_ERR            CL_HPP_ERR_STR_(clEnqueueAcquireGLObjects)
 #define __ENQUEUE_RELEASE_GL_ERR            CL_HPP_ERR_STR_(clEnqueueReleaseGLObjects)
@@ -866,13 +920,63 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __CREATE_PIPE_ERR             CL_HPP_ERR_STR_(clCreatePipe)
 #define __GET_PIPE_INFO_ERR           CL_HPP_ERR_STR_(clGetPipeInfo)
 
-
 #define __RETAIN_ERR                        CL_HPP_ERR_STR_(Retain Object)
 #define __RELEASE_ERR                       CL_HPP_ERR_STR_(Release Object)
 #define __FLUSH_ERR                         CL_HPP_ERR_STR_(clFlush)
 #define __FINISH_ERR                        CL_HPP_ERR_STR_(clFinish)
 #define __VECTOR_CAPACITY_ERR               CL_HPP_ERR_STR_(Vector capacity error)
 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __GET_HOST_TIMER_ERR           CL_HPP_ERR_STR_(clGetHostTimer)
+#define __GET_DEVICE_AND_HOST_TIMER_ERR           CL_HPP_ERR_STR_(clGetDeviceAndHostTimer)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+#define __SET_PROGRAM_RELEASE_CALLBACK_ERR          CL_HPP_ERR_STR_(clSetProgramReleaseCallback)
+#define __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR   CL_HPP_ERR_STR_(clSetProgramSpecializationConstant)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220
+
+#ifdef cl_khr_external_memory
+#define __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR       CL_HPP_ERR_STR_(clEnqueueAcquireExternalMemObjectsKHR)
+#define __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR       CL_HPP_ERR_STR_(clEnqueueReleaseExternalMemObjectsKHR)
+#endif // cl_khr_external_memory
+
+#ifdef cl_khr_semaphore
+#define __GET_SEMAPHORE_KHR_INFO_ERR                CL_HPP_ERR_STR_(clGetSemaphoreInfoKHR)
+#define __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR  CL_HPP_ERR_STR_(clCreateSemaphoreWithPropertiesKHR)
+#define __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR            CL_HPP_ERR_STR_(clEnqueueWaitSemaphoresKHR)
+#define __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR          CL_HPP_ERR_STR_(clEnqueueSignalSemaphoresKHR)
+#define __RETAIN_SEMAPHORE_KHR_ERR                  CL_HPP_ERR_STR_(clRetainSemaphoreKHR)
+#define __RELEASE_SEMAPHORE_KHR_ERR                 CL_HPP_ERR_STR_(clReleaseSemaphoreKHR)
+#endif // cl_khr_semaphore
+
+#ifdef cl_khr_external_semaphore
+#define __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR         CL_HPP_ERR_STR_(clGetSemaphoreHandleForTypeKHR)
+#endif // cl_khr_external_semaphore
+
+#if defined(cl_khr_command_buffer)
+#define __CREATE_COMMAND_BUFFER_KHR_ERR             CL_HPP_ERR_STR_(clCreateCommandBufferKHR)
+#define __GET_COMMAND_BUFFER_INFO_KHR_ERR           CL_HPP_ERR_STR_(clGetCommandBufferInfoKHR)
+#define __FINALIZE_COMMAND_BUFFER_KHR_ERR           CL_HPP_ERR_STR_(clFinalizeCommandBufferKHR)
+#define __ENQUEUE_COMMAND_BUFFER_KHR_ERR            CL_HPP_ERR_STR_(clEnqueueCommandBufferKHR)
+#define __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR    CL_HPP_ERR_STR_(clCommandBarrierWithWaitListKHR)
+#define __COMMAND_COPY_BUFFER_KHR_ERR               CL_HPP_ERR_STR_(clCommandCopyBufferKHR)
+#define __COMMAND_COPY_BUFFER_RECT_KHR_ERR          CL_HPP_ERR_STR_(clCommandCopyBufferRectKHR)
+#define __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR      CL_HPP_ERR_STR_(clCommandCopyBufferToImageKHR)
+#define __COMMAND_COPY_IMAGE_KHR_ERR                CL_HPP_ERR_STR_(clCommandCopyImageKHR)
+#define __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR      CL_HPP_ERR_STR_(clCommandCopyImageToBufferKHR)
+#define __COMMAND_FILL_BUFFER_KHR_ERR               CL_HPP_ERR_STR_(clCommandFillBufferKHR)
+#define __COMMAND_FILL_IMAGE_KHR_ERR                CL_HPP_ERR_STR_(clCommandFillImageKHR)
+#define __COMMAND_NDRANGE_KERNEL_KHR_ERR            CL_HPP_ERR_STR_(clCommandNDRangeKernelKHR)
+#define __UPDATE_MUTABLE_COMMANDS_KHR_ERR           CL_HPP_ERR_STR_(clUpdateMutableCommandsKHR)
+#define __GET_MUTABLE_COMMAND_INFO_KHR_ERR          CL_HPP_ERR_STR_(clGetMutableCommandInfoKHR)
+#define __RETAIN_COMMAND_BUFFER_KHR_ERR             CL_HPP_ERR_STR_(clRetainCommandBufferKHR)
+#define __RELEASE_COMMAND_BUFFER_KHR_ERR            CL_HPP_ERR_STR_(clReleaseCommandBufferKHR)
+#endif // cl_khr_command_buffer
+
+#if defined(cl_ext_image_requirements_info)
+#define __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR            CL_HPP_ERR_STR_(clGetImageRequirementsInfoEXT)
+#endif // cl_ext_image_requirements_info
+
 /**
  * CL 1.2 version that uses device fission.
  */
@@ -913,9 +1017,94 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __ENQUEUE_BARRIER_WAIT_LIST_ERR               CL_HPP_ERR_STR_(clEnqueueBarrierWithWaitList)
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+#define __CLONE_KERNEL_ERR     CL_HPP_ERR_STR_(clCloneKernel)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
+
 #endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS
 //! \endcond
 
+#ifdef cl_khr_external_memory
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueAcquireExternalMemObjectsKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueReleaseExternalMemObjectsKHR);
+
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueAcquireExternalMemObjectsKHR pfn_clEnqueueAcquireExternalMemObjectsKHR = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueReleaseExternalMemObjectsKHR pfn_clEnqueueReleaseExternalMemObjectsKHR = nullptr;
+#endif // cl_khr_external_memory
+
+#ifdef cl_khr_semaphore
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateSemaphoreWithPropertiesKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clReleaseSemaphoreKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clRetainSemaphoreKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueWaitSemaphoresKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueSignalSemaphoresKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetSemaphoreInfoKHR);
+
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateSemaphoreWithPropertiesKHR pfn_clCreateSemaphoreWithPropertiesKHR  = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clReleaseSemaphoreKHR              pfn_clReleaseSemaphoreKHR               = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clRetainSemaphoreKHR               pfn_clRetainSemaphoreKHR                = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueWaitSemaphoresKHR         pfn_clEnqueueWaitSemaphoresKHR          = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueSignalSemaphoresKHR       pfn_clEnqueueSignalSemaphoresKHR        = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetSemaphoreInfoKHR              pfn_clGetSemaphoreInfoKHR               = nullptr;
+#endif // cl_khr_semaphore
+
+#ifdef cl_khr_external_semaphore
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetSemaphoreHandleForTypeKHR);
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetSemaphoreHandleForTypeKHR     pfn_clGetSemaphoreHandleForTypeKHR      = nullptr;
+#endif // cl_khr_external_semaphore
+
+#if defined(cl_khr_command_buffer)
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateCommandBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clFinalizeCommandBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clRetainCommandBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clReleaseCommandBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetCommandBufferInfoKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueCommandBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandBarrierWithWaitListKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferRectKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferToImageKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyImageKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyImageToBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandFillBufferKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandFillImageKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandNDRangeKernelKHR);
+
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateCommandBufferKHR pfn_clCreateCommandBufferKHR               = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clFinalizeCommandBufferKHR pfn_clFinalizeCommandBufferKHR           = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clRetainCommandBufferKHR pfn_clRetainCommandBufferKHR               = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clReleaseCommandBufferKHR pfn_clReleaseCommandBufferKHR             = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetCommandBufferInfoKHR pfn_clGetCommandBufferInfoKHR             = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueCommandBufferKHR pfn_clEnqueueCommandBufferKHR             = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandBarrierWithWaitListKHR pfn_clCommandBarrierWithWaitListKHR = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferKHR pfn_clCommandCopyBufferKHR                   = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferRectKHR pfn_clCommandCopyBufferRectKHR           = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferToImageKHR pfn_clCommandCopyBufferToImageKHR     = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyImageKHR pfn_clCommandCopyImageKHR                     = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyImageToBufferKHR pfn_clCommandCopyImageToBufferKHR     = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandFillBufferKHR pfn_clCommandFillBufferKHR                   = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandFillImageKHR pfn_clCommandFillImageKHR                     = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandNDRangeKernelKHR pfn_clCommandNDRangeKernelKHR             = nullptr;
+#endif /* cl_khr_command_buffer */
+
+#if defined(cl_khr_command_buffer_mutable_dispatch)
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clUpdateMutableCommandsKHR);
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetMutableCommandInfoKHR);
+
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clUpdateMutableCommandsKHR pfn_clUpdateMutableCommandsKHR           = nullptr;
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetMutableCommandInfoKHR pfn_clGetMutableCommandInfoKHR           = nullptr;
+#endif /* cl_khr_command_buffer_mutable_dispatch */
+
+#if defined(cl_ext_image_requirements_info)
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetImageRequirementsInfoEXT);
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetImageRequirementsInfoEXT pfn_clGetImageRequirementsInfoEXT  = nullptr;
+#endif // cl_ext_image_requirements_info
+
+#if defined(cl_ext_device_fission)
+CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateSubDevicesEXT);
+CL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateSubDevicesEXT
+    pfn_clCreateSubDevicesEXT = nullptr;
+#endif // cl_ext_device_fission
 
 namespace detail {
 
@@ -926,7 +1115,7 @@ namespace detail {
 template<typename Functor, typename T>
 inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
 {
-    return f(name, sizeof(T), param, NULL);
+    return f(name, sizeof(T), param, nullptr);
 }
 
 // Specialized for getInfo<CL_PROGRAM_BINARIES>
@@ -947,14 +1136,13 @@ inline cl_int getInfoHelper(Func f, cl_uint name, vector<vector<unsigned char>>*
             binariesPointers[i] = (*param)[i].data();
         }
 
-        cl_int err = f(name, numBinaries * sizeof(unsigned char*), binariesPointers.data(), NULL);
+        cl_int err = f(name, numBinaries * sizeof(unsigned char*), binariesPointers.data(), nullptr);
 
         if (err != CL_SUCCESS) {
             return err;
         }
     }
 
-
     return CL_SUCCESS;
 }
 
@@ -963,7 +1151,7 @@ template <typename Func, typename T>
 inline cl_int getInfoHelper(Func f, cl_uint name, vector<T>* param, long)
 {
     size_type required;
-    cl_int err = f(name, 0, NULL, &required);
+    cl_int err = f(name, 0, nullptr, &required);
     if (err != CL_SUCCESS) {
         return err;
     }
@@ -971,7 +1159,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, vector<T>* param, long)
 
     // Temporary to avoid changing param on an error
     vector<T> localData(elements);
-    err = f(name, required, localData.data(), NULL);
+    err = f(name, required, localData.data(), nullptr);
     if (err != CL_SUCCESS) {
         return err;
     }
@@ -993,7 +1181,7 @@ inline cl_int getInfoHelper(
     Func f, cl_uint name, vector<T>* param, int, typename T::cl_type = 0)
 {
     size_type required;
-    cl_int err = f(name, 0, NULL, &required);
+    cl_int err = f(name, 0, nullptr, &required);
     if (err != CL_SUCCESS) {
         return err;
     }
@@ -1001,7 +1189,7 @@ inline cl_int getInfoHelper(
     const size_type elements = required / sizeof(typename T::cl_type);
 
     vector<typename T::cl_type> value(elements);
-    err = f(name, required, value.data(), NULL);
+    err = f(name, required, value.data(), nullptr);
     if (err != CL_SUCCESS) {
         return err;
     }
@@ -1024,7 +1212,7 @@ template <typename Func>
 inline cl_int getInfoHelper(Func f, cl_uint name, string* param, long)
 {
     size_type required;
-    cl_int err = f(name, 0, NULL, &required);
+    cl_int err = f(name, 0, nullptr, &required);
     if (err != CL_SUCCESS) {
         return err;
     }
@@ -1033,12 +1221,12 @@ inline cl_int getInfoHelper(Func f, cl_uint name, string* param, long)
     // a char vector does not
     if (required > 0) {
         vector<char> value(required);
-        err = f(name, required, value.data(), NULL);
+        err = f(name, required, value.data(), nullptr);
         if (err != CL_SUCCESS) {
             return err;
         }
         if (param) {
-            param->assign(begin(value), prev(end(value)));
+            param->assign(value.begin(), value.end() - 1);
         }
     }
     else if (param) {
@@ -1052,7 +1240,7 @@ template <typename Func, size_type N>
 inline cl_int getInfoHelper(Func f, cl_uint name, array<size_type, N>* param, long)
 {
     size_type required;
-    cl_int err = f(name, 0, NULL, &required);
+    cl_int err = f(name, 0, nullptr, &required);
     if (err != CL_SUCCESS) {
         return err;
     }
@@ -1060,7 +1248,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, array<size_type, N>* param, lo
     size_type elements = required / sizeof(size_type);
     vector<size_type> value(elements, 0);
 
-    err = f(name, required, value.data(), NULL);
+    err = f(name, required, value.data(), nullptr);
     if (err != CL_SUCCESS) {
         return err;
     }
@@ -1089,12 +1277,12 @@ template<typename Func, typename T>
 inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
 {
     typename T::cl_type value;
-    cl_int err = f(name, sizeof(value), &value, NULL);
+    cl_int err = f(name, sizeof(value), &value, nullptr);
     if (err != CL_SUCCESS) {
         return err;
     }
     *param = value;
-    if (value != NULL)
+    if (value != nullptr)
     {
         err = param->retain();
         if (err != CL_SUCCESS) {
@@ -1155,7 +1343,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
     F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
     F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
     F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
-    F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+    F(cl_device_info, CL_DEVICE_PLATFORM, cl::Platform) \
     F(cl_device_info, CL_DEVICE_NAME, string) \
     F(cl_device_info, CL_DEVICE_VENDOR, string) \
     F(cl_device_info, CL_DRIVER_VERSION, string) \
@@ -1261,13 +1449,20 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
     F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \
     F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \
     \
+    F(cl_kernel_work_group_info, CL_KERNEL_GLOBAL_WORK_SIZE, cl::detail::size_t_array) \
+    \
+    F(cl_device_info, CL_DEVICE_LINKER_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, size_type) \
+    F(cl_device_info, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, size_type) \
     F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \
+    F(cl_device_info, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) \
     F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector<cl_device_partition_property>) \
     F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector<cl_device_partition_property>)  \
     F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, size_type) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \
     F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
     F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \
+    F(cl_device_info, CL_DEVICE_PRINTF_BUFFER_SIZE, size_type) \
     \
     F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \
     F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \
@@ -1287,19 +1482,135 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_
     F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \
     F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \
     F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, cl_uint ) \
+    F(cl_device_info, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, size_type ) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, size_type ) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_COMPLETE, cl_ulong) \
+    F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool) \
+    F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_PTRS, void**) \
     F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \
     F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \
     F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \
     F(cl_pipe_info, CL_PIPE_PACKET_SIZE, cl_uint) \
     F(cl_pipe_info, CL_PIPE_MAX_PACKETS, cl_uint)
 
-#define CL_HPP_PARAM_NAME_DEVICE_FISSION_(F) \
-    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+#define CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(F) \
+    F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR, size_type) \
+    F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR, size_type)
+
+#define CL_HPP_PARAM_NAME_INFO_IL_KHR_(F) \
+    F(cl_device_info, CL_DEVICE_IL_VERSION_KHR, string) \
+    F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector<unsigned char>)
+
+#define CL_HPP_PARAM_NAME_INFO_2_1_(F) \
+    F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, cl_ulong) \
+    F(cl_program_info, CL_PROGRAM_IL, cl::vector<unsigned char>) \
+    F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_IL_VERSION, string) \
+    F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \
+    F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, cl::DeviceCommandQueue) \
+    F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \
+    F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \
+    F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) \
+    F(cl_kernel_sub_group_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \
+    F(cl_kernel_sub_group_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type)
+
+#define CL_HPP_PARAM_NAME_INFO_2_2_(F) \
+    F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \
+    F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT, cl_bool)
+
+#define CL_HPP_PARAM_NAME_DEVICE_FISSION_EXT_(F) \
+    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl::Device) \
     F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector<cl_device_partition_property_ext>) \
     F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, cl::vector<cl_device_partition_property_ext>) \
     F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
     F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector<cl_device_partition_property_ext>)
 
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(F) \
+    F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION_KHR, cl_version_khr) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \
+    \
+    F(cl_device_info, CL_DEVICE_NUMERIC_VERSION_KHR, cl_version_khr) \
+    F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \
+    F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \
+    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>)
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \
+    F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr)
+
+// Note: CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR is not in this list; its traits are declared separately, only when the headers define that enum.
+#define CL_HPP_PARAM_NAME_CL_KHR_SEMAPHORE_(F) \
+    F(cl_semaphore_info_khr, CL_SEMAPHORE_CONTEXT_KHR, cl::Context) \
+    F(cl_semaphore_info_khr, CL_SEMAPHORE_REFERENCE_COUNT_KHR, cl_uint) \
+    F(cl_semaphore_info_khr, CL_SEMAPHORE_PROPERTIES_KHR, cl::vector<cl_semaphore_properties_khr>) \
+    F(cl_semaphore_info_khr, CL_SEMAPHORE_TYPE_KHR, cl_semaphore_type_khr) \
+    F(cl_semaphore_info_khr, CL_SEMAPHORE_PAYLOAD_KHR, cl_semaphore_payload_khr) \
+    F(cl_platform_info, CL_PLATFORM_SEMAPHORE_TYPES_KHR,  cl::vector<cl_semaphore_type_khr>) \
+    F(cl_device_info, CL_DEVICE_SEMAPHORE_TYPES_KHR,      cl::vector<cl_semaphore_type_khr>) \
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_MEMORY_(F) \
+    F(cl_device_info, CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, cl::vector<cl::ExternalMemoryType>) \
+    F(cl_platform_info, CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, cl::vector<cl::ExternalMemoryType>)
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_(F) \
+    F(cl_platform_info, CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR,  cl::vector<cl_external_semaphore_handle_type_khr>) \
+    F(cl_platform_info, CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,  cl::vector<cl_external_semaphore_handle_type_khr>) \
+    F(cl_device_info, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR,      cl::vector<cl_external_semaphore_handle_type_khr>) \
+    F(cl_device_info, CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,      cl::vector<cl_external_semaphore_handle_type_khr>) \
+    F(cl_semaphore_info_khr, CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,      cl::vector<cl_external_semaphore_handle_type_khr>) \
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_DX_FENCE_EXT(F) \
+    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR, void*) \
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXT(F) \
+    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR, int) \
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXT(F) \
+    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_SYNC_FD_KHR, int) \
+
+#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXT(F) \
+    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR, void*) \
+    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR, void*) \
+
+#define CL_HPP_PARAM_NAME_INFO_3_0_(F) \
+    F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector<cl_name_version>) \
+    \
+    F(cl_device_info, CL_DEVICE_NUMERIC_VERSION, cl_version) \
+    F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION, cl::vector<cl_name_version>) \
+    F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION, cl::vector<cl_name_version>) \
+    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION, cl::vector<cl_name_version>) \
+    F(cl_device_info, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, cl_device_atomic_capabilities) \
+    F(cl_device_info, CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, cl_device_atomic_capabilities) \
+    F(cl_device_info, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_OPENCL_C_ALL_VERSIONS, cl::vector<cl_name_version>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \
+    F(cl_device_info, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_OPENCL_C_FEATURES, cl::vector<cl_name_version>) \
+    F(cl_device_info, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, cl_device_device_enqueue_capabilities) \
+    F(cl_device_info, CL_DEVICE_PIPE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED, string) \
+    \
+    F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector<cl_queue_properties>) \
+    F(cl_mem_info, CL_MEM_PROPERTIES, cl::vector<cl_mem_properties>) \
+    F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector<cl_pipe_properties>) \
+    F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector<cl_sampler_properties>) \
+
+#define CL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_EXT(F) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, size_type) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT, size_type) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_SIZE_EXT, size_type) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT, cl_uint) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, cl_uint) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT, cl_uint) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT, cl_uint) \
+
+#define CL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT(F) \
+    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT, size_type) \
+
 template <typename enum_type, cl_int Name>
 struct param_traits {};
 
@@ -1318,10 +1629,27 @@ CL_HPP_PARAM_NAME_INFO_1_1_(CL_HPP_DECLARE_PARAM_TRAITS_)
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
 CL_HPP_PARAM_NAME_INFO_1_2_(CL_HPP_DECLARE_PARAM_TRAITS_)
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
 CL_HPP_PARAM_NAME_INFO_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+CL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+CL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220
+#if CL_HPP_TARGET_OPENCL_VERSION >= 300
+CL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300
+
+#if defined(cl_khr_subgroups) && CL_HPP_TARGET_OPENCL_VERSION < 210
+CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // #if defined(cl_khr_subgroups) && CL_HPP_TARGET_OPENCL_VERSION < 210
+
+#if defined(cl_khr_il_program) && CL_HPP_TARGET_OPENCL_VERSION < 210
+CL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // #if defined(cl_khr_il_program) && CL_HPP_TARGET_OPENCL_VERSION < 210
 
 
 // Flags deprecated in OpenCL 2.0
@@ -1346,9 +1674,84 @@ CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
 CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
 #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
 
-#if defined(CL_HPP_USE_CL_DEVICE_FISSION)
-CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_);
-#endif // CL_HPP_USE_CL_DEVICE_FISSION
+#if defined(cl_ext_device_fission)
+CL_HPP_PARAM_NAME_DEVICE_FISSION_EXT_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_ext_device_fission
+
+#if defined(cl_khr_extended_versioning)
+#if CL_HPP_TARGET_OPENCL_VERSION < 300
+CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION < 300
+CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_extended_versioning
+
+#if defined(cl_khr_semaphore)
+CL_HPP_PARAM_NAME_CL_KHR_SEMAPHORE_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#if defined(CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_semaphore_info_khr, CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR, cl::vector<cl::Device>)
+#endif // defined(CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR)
+#endif // defined(cl_khr_semaphore)
+
+#ifdef cl_khr_external_memory
+CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_MEMORY_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_external_memory
+
+#if defined(cl_khr_external_semaphore)
+CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_external_semaphore
+
+#if defined(cl_khr_external_semaphore_dx_fence)
+CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_DX_FENCE_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_external_semaphore_dx_fence
+#if defined(cl_khr_external_semaphore_opaque_fd)
+CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_external_semaphore_opaque_fd
+#if defined(cl_khr_external_semaphore_sync_fd)
+CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_external_semaphore_sync_fd
+#if defined(cl_khr_external_semaphore_win32)
+CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_khr_external_semaphore_win32
+
+#if defined(cl_khr_device_uuid)
+using uuid_array = array<cl_uchar, CL_UUID_SIZE_KHR>; // result type of the device/driver UUID queries below
+using luid_array = array<cl_uchar, CL_LUID_SIZE_KHR>; // result type of the device LUID query below
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DRIVER_UUID_KHR, uuid_array)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_VALID_KHR, cl_bool)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_KHR, luid_array)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint)
+#endif // cl_khr_device_uuid
+
+#if defined(cl_khr_pci_bus_info)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr)
+#endif // cl_khr_pci_bus_info
+
+// Note: some headers do not define cl_khr_image2d_from_buffer
+#if CL_HPP_TARGET_OPENCL_VERSION < 200
+#if defined(CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, cl_uint)
+#endif
+#if defined(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR, cl_uint)
+#endif
+#endif // CL_HPP_TARGET_OPENCL_VERSION < 200
+
+#if defined(cl_khr_integer_dot_product)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr)
+#if defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, cl_device_integer_dot_product_acceleration_properties_khr)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, cl_device_integer_dot_product_acceleration_properties_khr)
+#endif // defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR)
+#endif // defined(cl_khr_integer_dot_product)
+
+#if defined(cl_ext_image_requirements_info)
+CL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_ext_image_requirements_info
+
+#if defined(cl_ext_image_from_buffer)
+CL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // cl_ext_image_from_buffer
 
 #ifdef CL_PLATFORM_ICD_SUFFIX_KHR
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string)
@@ -1357,7 +1760,6 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, strin
 #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
 #endif
-
 #ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector<size_type>)
 #endif
@@ -1388,6 +1790,40 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUT
 #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
 #endif
+#ifdef CL_DEVICE_BOARD_NAME_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_BOARD_NAME_AMD, string)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong)
+#endif
+#ifdef CL_DEVICE_JOB_SLOTS_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint)
+#endif
+#ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield)
+#endif
+#ifdef CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM, vector<cl_uint>)
+#endif
+#ifdef CL_DEVICE_MAX_WARP_COUNT_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_MAX_WARP_COUNT_ARM, cl_uint)
+#endif
+#ifdef CL_KERNEL_MAX_WARP_COUNT_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_info, CL_KERNEL_MAX_WARP_COUNT_ARM, cl_uint)
+#endif
+#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint)
+#endif
+#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int)
+#endif
+#ifdef CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM, cl_uint)
+#endif
+#ifdef CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM, cl_uint)
+#endif
 
 #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
@@ -1411,6 +1847,32 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, c
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
 #endif
 
+#if defined(cl_khr_command_buffer)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR, cl_device_command_buffer_capabilities_khr)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR, cl_command_buffer_properties_khr)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_QUEUES_KHR, cl::vector<CommandQueue>)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_NUM_QUEUES_KHR, cl_uint)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR, cl_uint)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_STATE_KHR, cl_command_buffer_state_khr)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR, cl::vector<cl_command_buffer_properties_khr>)
+#endif /* cl_khr_command_buffer */
+
+#if defined(cl_khr_command_buffer_mutable_dispatch)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR, CommandQueue)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR, CommandBufferKhr)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR, cl_command_type)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_PROPERTIES_ARRAY_KHR, cl::vector<cl_ndrange_kernel_command_properties_khr>)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_KERNEL_KHR, cl_kernel)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_DIMENSIONS_KHR, cl_uint)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR, cl::vector<size_type>)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR, cl::vector<size_type>)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR, cl::vector<size_type>)
+#endif /* cl_khr_command_buffer_mutable_dispatch */
+
+#if defined(cl_khr_kernel_clock)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR, cl_device_kernel_clock_capabilities_khr)
+#endif /* cl_khr_kernel_clock */
+
 // Convenience functions
 
 template <typename Func, typename T>
@@ -1579,8 +2041,65 @@ struct ReferenceHandler<cl_event>
     { return ::clReleaseEvent(event); }
 };
 
+#ifdef cl_khr_semaphore
+template <>
+struct ReferenceHandler<cl_semaphore_khr> // retain/release hooks for cl_semaphore_khr handles
+{
+    static cl_int retain(cl_semaphore_khr semaphore)
+    { // Forwards to pfn_clRetainSemaphoreKHR when that pointer is non-null.
+        if (pfn_clRetainSemaphoreKHR != nullptr) {
+            return pfn_clRetainSemaphoreKHR(semaphore);
+        }
+
+        return CL_INVALID_OPERATION; // null function pointer: nothing to call
+    }
+
+    static cl_int release(cl_semaphore_khr semaphore)
+    { // Forwards to pfn_clReleaseSemaphoreKHR when that pointer is non-null.
+        if (pfn_clReleaseSemaphoreKHR != nullptr) {
+            return pfn_clReleaseSemaphoreKHR(semaphore);
+        }
+
+        return CL_INVALID_OPERATION; // null function pointer: nothing to call
+    }
+};
+#endif // cl_khr_semaphore
+#if defined(cl_khr_command_buffer)
+template <>
+struct ReferenceHandler<cl_command_buffer_khr> // retain/release hooks for cl_command_buffer_khr handles
+{
+    static cl_int retain(cl_command_buffer_khr cmdBufferKhr)
+    {
+        if (pfn_clRetainCommandBufferKHR == nullptr) { // no entry point to call: report through errHandler
+            return detail::errHandler(CL_INVALID_OPERATION, __RETAIN_COMMAND_BUFFER_KHR_ERR);
+        }
+        return pfn_clRetainCommandBufferKHR(cmdBufferKhr);
+    }
+
+    static cl_int release(cl_command_buffer_khr cmdBufferKhr)
+    {
+        if (pfn_clReleaseCommandBufferKHR == nullptr) { // no entry point to call: report through errHandler
+            return detail::errHandler(CL_INVALID_OPERATION, __RELEASE_COMMAND_BUFFER_KHR_ERR);
+        }
+        return pfn_clReleaseCommandBufferKHR(cmdBufferKhr);
+    }
+};
+
+template <>
+struct ReferenceHandler<cl_mutable_command_khr> // no-op hooks: both calls only return CL_SUCCESS
+{
+    // cl_mutable_command_khr does not have retain().
+    static cl_int retain(cl_mutable_command_khr)
+    { return CL_SUCCESS; }
+    // cl_mutable_command_khr does not have release().
+    static cl_int release(cl_mutable_command_khr)
+    { return CL_SUCCESS; }
+};
+#endif // cl_khr_command_buffer
+
 
-#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+#if (CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120) || \
+    (CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200)
 // Extracts version number with major in the upper 16 bits, minor in the lower 16
 static cl_uint getVersion(const vector<char> &versionInfo)
 {
@@ -1604,7 +2123,7 @@ static cl_uint getVersion(const vector<char> &versionInfo)
 static cl_uint getPlatformVersion(cl_platform_id platform)
 {
     size_type size = 0;
-    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &size);
 
     vector<char> versionInfo(size);
     clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, versionInfo.data(), &size);
@@ -1614,7 +2133,7 @@ static cl_uint getPlatformVersion(cl_platform_id platform)
 static cl_uint getDevicePlatformVersion(cl_device_id device)
 {
     cl_platform_id platform;
-    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr);
     return getPlatformVersion(platform);
 }
 
@@ -1623,14 +2142,14 @@ static cl_uint getContextPlatformVersion(cl_context context)
     // The platform cannot be queried directly, so we first have to grab a
     // device and obtain its context
     size_type size = 0;
-    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, nullptr, &size);
     if (size == 0)
         return 0;
     vector<cl_device_id> devices(size/sizeof(cl_device_id));
-    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), NULL);
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), nullptr);
     return getDevicePlatformVersion(devices[0]);
 }
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+#endif // CL_HPP_TARGET_OPENCL_VERSION && CL_HPP_MINIMUM_OPENCL_VERSION
 
 template <typename T>
 class Wrapper
@@ -1642,7 +2161,7 @@ class Wrapper
     cl_type object_;
 
 public:
-    Wrapper() : object_(NULL) { }
+    Wrapper() : object_(nullptr) { }
     
     Wrapper(const cl_type &obj, bool retainObject) : object_(obj) 
     {
@@ -1653,7 +2172,7 @@ class Wrapper
 
     ~Wrapper()
     {
-        if (object_ != NULL) { release(); }
+        if (object_ != nullptr) { release(); }
     }
 
     Wrapper(const Wrapper<cl_type>& rhs)
@@ -1662,10 +2181,10 @@ class Wrapper
         detail::errHandler(retain(), __RETAIN_ERR);
     }
 
-    Wrapper(Wrapper<cl_type>&& rhs) CL_HPP_NOEXCEPT_
+    Wrapper(Wrapper<cl_type>&& rhs) noexcept
     {
         object_ = rhs.object_;
-        rhs.object_ = NULL;
+        rhs.object_ = nullptr;
     }
 
     Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
@@ -1683,7 +2202,7 @@ class Wrapper
         if (this != &rhs) {
             detail::errHandler(release(), __RELEASE_ERR);
             object_ = rhs.object_;
-            rhs.object_ = NULL;
+            rhs.object_ = nullptr;
         }
         return *this;
     }
@@ -1699,10 +2218,7 @@ class Wrapper
 
     cl_type& operator ()() { return object_; }
 
-    const cl_type get() const { return object_; }
-
-    cl_type get() { return object_; }
-
+    cl_type get() const { return object_; }
 
 protected:
     template<typename Func, typename U>
@@ -1742,23 +2258,22 @@ class Wrapper<cl_device_id>
     static bool isReferenceCountable(cl_device_id device)
     {
         bool retVal = false;
-#if CL_HPP_TARGET_OPENCL_VERSION >= 120
-#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
-        if (device != NULL) {
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+        if (device != nullptr) {
             int version = getDevicePlatformVersion(device);
             if(version > ((1 << 16) + 1)) {
                 retVal = true;
             }
         }
-#else // CL_HPP_MINIMUM_OPENCL_VERSION < 120
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 120
         retVal = true;
-#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#endif // CL_HPP_TARGET_OPENCL_VERSION
+        (void)device;
         return retVal;
     }
 
 public:
-    Wrapper() : object_(NULL), referenceCountable_(false) 
+    Wrapper() : object_(nullptr), referenceCountable_(false) 
     { 
     }
     
@@ -1785,11 +2300,11 @@ class Wrapper<cl_device_id>
         detail::errHandler(retain(), __RETAIN_ERR);
     }
 
-    Wrapper(Wrapper<cl_type>&& rhs) CL_HPP_NOEXCEPT_
+    Wrapper(Wrapper<cl_type>&& rhs) noexcept
     {
         object_ = rhs.object_;
         referenceCountable_ = rhs.referenceCountable_;
-        rhs.object_ = NULL;
+        rhs.object_ = nullptr;
         rhs.referenceCountable_ = false;
     }
 
@@ -1810,7 +2325,7 @@ class Wrapper<cl_device_id>
             detail::errHandler(release(), __RELEASE_ERR);
             object_ = rhs.object_;
             referenceCountable_ = rhs.referenceCountable_;
-            rhs.object_ = NULL;
+            rhs.object_ = nullptr;
             rhs.referenceCountable_ = false;
         }
         return *this;
@@ -1874,51 +2389,7 @@ inline bool operator!=(const Wrapper<T> &lhs, const Wrapper<T> &rhs)
 //! \endcond
 
 
-using BuildLogType = vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, CL_PROGRAM_BUILD_LOG>::param_type>>;
-#if defined(CL_HPP_ENABLE_EXCEPTIONS)
-/**
-* Exception class for build errors to carry build info
-*/
-class BuildError : public Error
-{
-private:
-    BuildLogType buildLogs;
-public:
-    BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec)
-    {
-    }
-
-    BuildLogType getBuildLog() const
-    {
-        return buildLogs;
-    }
-};
-namespace detail {
-    static inline cl_int buildErrHandler(
-        cl_int err,
-        const char * errStr,
-        const BuildLogType &buildLogs)
-    {
-        if (err != CL_SUCCESS) {
-            throw BuildError(err, errStr, buildLogs);
-        }
-        return err;
-    }
-} // namespace detail
 
-#else
-namespace detail {
-    static inline cl_int buildErrHandler(
-        cl_int err,
-        const char * errStr,
-        const BuildLogType &buildLogs)
-    {
-        (void)buildLogs; // suppress unused variable warning
-        (void)errStr;
-        return err;
-    }
-} // namespace detail
-#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
 
 
 /*! \stuct ImageFormat
@@ -1938,6 +2409,9 @@ struct ImageFormat : public cl_image_format
         image_channel_data_type = type;
     }
 
+    //! \brief Copy constructor.
+    ImageFormat(const ImageFormat &other) { *this = other; }
+
     //! \brief Assignment operator.
     ImageFormat& operator = (const ImageFormat& rhs)
     {
@@ -1992,7 +2466,7 @@ class Device : public detail::Wrapper<cl_device_id>
     }
 #endif // #ifdef CL_HPP_UNIT_TEST_ENABLE
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Device() : detail::Wrapper<cl_type>() { }
 
     /*! \brief Constructor from cl_device_id.
@@ -2007,11 +2481,11 @@ class Device : public detail::Wrapper<cl_device_id>
      *  \see Context::getDefault()
      */
     static Device getDefault(
-        cl_int *errResult = NULL)
+        cl_int *errResult = nullptr)
     {
         std::call_once(default_initialized_, makeDefault);
         detail::errHandler(default_error_);
-        if (errResult != NULL) {
+        if (errResult != nullptr) {
             *errResult = default_error_;
         }
         return default_;
@@ -2040,34 +2514,7 @@ class Device : public detail::Wrapper<cl_device_id>
         detail::Wrapper<cl_type>::operator=(rhs);
         return *this;
     }
-
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-    * Required for MSVC.
-    */
-    Device(const Device& dev) : detail::Wrapper<cl_type>(dev) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-    * Required for MSVC.
-    */
-    Device& operator = (const Device &dev)
-    {
-        detail::Wrapper<cl_type>::operator=(dev);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-    * Required for MSVC.
-    */
-    Device(Device&& dev) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(dev)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-    * Required for MSVC.
-    */
-    Device& operator = (Device &&dev)
-    {
-        detail::Wrapper<cl_type>::operator=(std::move(dev));
-        return *this;
-    }
+ 
 
     //! \brief Wrapper for clGetDeviceInfo().
     template <typename T>
@@ -2079,104 +2526,123 @@ class Device : public detail::Wrapper<cl_device_id>
     }
 
     //! \brief Wrapper for clGetDeviceInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_device_info name> typename
     detail::param_traits<detail::cl_device_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_device_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
     }
 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
     /**
-     * CL 1.2 version
-     */
-#if CL_HPP_TARGET_OPENCL_VERSION >= 120
-    //! \brief Wrapper for clCreateSubDevices().
-    cl_int createSubDevices(
-        const cl_device_partition_property * properties,
-        vector<Device>* devices)
-    {
-        cl_uint n = 0;
-        cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+     * Return the current value of the host clock as seen by the device.
+     * The resolution of the host timer may be queried with the
+     * CL_PLATFORM_HOST_TIMER_RESOLUTION query.
+     * @return The host timer value.
+     */
+    cl_ulong getHostTimer(cl_int *error = nullptr)
+    {
+        cl_ulong retVal = 0;
+        cl_int err = 
+            clGetHostTimer(this->get(), &retVal);
+        detail::errHandler(
+            err,
+            __GET_HOST_TIMER_ERR);
+        if (error) {
+            *error = err;
         }
+        return retVal;
+    }
 
-        vector<cl_device_id> ids(n);
-        err = clCreateSubDevices(object_, properties, n, ids.data(), NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+    /**
+     * Return a synchronized pair of host and device timestamps as seen by device.
+     * Use to correlate the clocks and get the host timer only using getHostTimer
+     * as a lower cost mechanism in between calls.
+     * The resolution of the host timer may be queried with the 
+     * CL_PLATFORM_HOST_TIMER_RESOLUTION query.
+     * The resolution of the device timer may be queried with the
+     * CL_DEVICE_PROFILING_TIMER_RESOLUTION query.
+     * @return A pair of (device timer, host timer) timer values.
+     */
+    std::pair<cl_ulong, cl_ulong> getDeviceAndHostTimer(cl_int *error = nullptr)
+    {
+        std::pair<cl_ulong, cl_ulong> retVal;
+        cl_int err =
+            clGetDeviceAndHostTimer(this->get(), &(retVal.first), &(retVal.second));
+        detail::errHandler(
+            err,
+            __GET_DEVICE_AND_HOST_TIMER_ERR);
+        if (error) {
+            *error = err;
         }
+        return retVal;
+    }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
 
-        // Cannot trivially assign because we need to capture intermediates 
-        // with safe construction
-        if (devices) {
-            devices->resize(ids.size());
-
-            // Assign to param, constructing with retain behaviour
-            // to correctly capture each underlying CL object
-            for (size_type i = 0; i < ids.size(); i++) {
-                // We do not need to retain because this device is being created 
-                // by the runtime
-                (*devices)[i] = Device(ids[i], false);
-            }
-        }
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+    //! \brief Wrapper for clCreateSubDevices().
+    cl_int createSubDevices(const cl_device_partition_property* properties,
+                            vector<Device>* devices);
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
-        return CL_SUCCESS;
-    }
-#elif defined(CL_HPP_USE_CL_DEVICE_FISSION)
+#if defined(cl_ext_device_fission)
+    //! \brief Wrapper for clCreateSubDevicesEXT().
+    cl_int createSubDevices(const cl_device_partition_property_ext* properties,
+                            vector<Device>* devices);
+#endif // defined(cl_ext_device_fission)
+};
 
+using BuildLogType = vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, CL_PROGRAM_BUILD_LOG>::param_type>>;
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
 /**
- * CL 1.1 version that uses device fission extension.
- */
-    cl_int createSubDevices(
-        const cl_device_partition_property_ext * properties,
-        vector<Device>* devices)
+* Exception class for build errors to carry build info
+*/
+class BuildError : public Error
+{
+private:
+    BuildLogType buildLogs;
+public:
+    BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec)
     {
-        typedef CL_API_ENTRY cl_int 
-            ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
-                cl_device_id /*in_device*/,
-                const cl_device_partition_property_ext * /* properties */,
-                cl_uint /*num_entries*/,
-                cl_device_id * /*out_devices*/,
-                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
-
-        static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
-        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT);
+    }
 
-        cl_uint n = 0;
-        cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+    BuildLogType getBuildLog() const
+    {
+        return buildLogs;
+    }
+};
+namespace detail {
+    static inline cl_int buildErrHandler(
+        cl_int err,
+        const char * errStr,
+        const BuildLogType &buildLogs)
+    {
         if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+            throw BuildError(err, errStr, buildLogs);
         }
+        return err;
+    }
+} // namespace detail
 
-        vector<cl_device_id> ids(n);
-        err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
-        }
-        // Cannot trivially assign because we need to capture intermediates 
-        // with safe construction
-        if (devices) {
-            devices->resize(ids.size());
-
-            // Assign to param, constructing with retain behaviour
-            // to correctly capture each underlying CL object
-            for (size_type i = 0; i < ids.size(); i++) {
-                // We do not need to retain because this device is being created 
-                // by the runtime
-                (*devices)[i] = Device(ids[i], false);
-            }
-        }
-        return CL_SUCCESS;
+#else
+namespace detail {
+    static inline cl_int buildErrHandler(
+        cl_int err,
+        const char * errStr,
+        const BuildLogType &buildLogs)
+    {
+        (void)buildLogs; // suppress unused variable warning
+        (void)errStr;
+        return err;
     }
-#endif // defined(CL_HPP_USE_CL_DEVICE_FISSION)
-};
+} // namespace detail
+#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
 
 CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_;
 CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_;
@@ -2213,7 +2679,7 @@ class Platform : public detail::Wrapper<cl_platform_id>
             // Otherwise set it
             cl_uint n = 0;
 
-            cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+            cl_int err = ::clGetPlatformIDs(0, nullptr, &n);
             if (err != CL_SUCCESS) {
                 default_error_ = err;
                 return;
@@ -2224,7 +2690,7 @@ class Platform : public detail::Wrapper<cl_platform_id>
             }
 
             vector<cl_platform_id> ids(n);
-            err = ::clGetPlatformIDs(n, ids.data(), NULL);
+            err = ::clGetPlatformIDs(n, ids.data(), nullptr);
             if (err != CL_SUCCESS) {
                 default_error_ = err;
                 return;
@@ -2261,7 +2727,7 @@ class Platform : public detail::Wrapper<cl_platform_id>
     }
 #endif // #ifdef CL_HPP_UNIT_TEST_ENABLE
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Platform() : detail::Wrapper<cl_type>()  { }
 
     /*! \brief Constructor from cl_platform_id.
@@ -2285,11 +2751,11 @@ class Platform : public detail::Wrapper<cl_platform_id>
     }
 
     static Platform getDefault(
-        cl_int *errResult = NULL)
+        cl_int *errResult = nullptr)
     {
         std::call_once(default_initialized_, makeDefault);
         detail::errHandler(default_error_);
-        if (errResult != NULL) {
+        if (errResult != nullptr) {
             *errResult = default_error_;
         }
         return default_;
@@ -2310,7 +2776,8 @@ class Platform : public detail::Wrapper<cl_platform_id>
     }
 
     //! \brief Wrapper for clGetPlatformInfo().
-    cl_int getInfo(cl_platform_info name, string* param) const
+    template <typename T>
+    cl_int getInfo(cl_platform_info name, T* param) const
     {
         return detail::errHandler(
             detail::getInfo(&::clGetPlatformInfo, object_, name, param),
@@ -2318,14 +2785,14 @@ class Platform : public detail::Wrapper<cl_platform_id>
     }
 
     //! \brief Wrapper for clGetPlatformInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_platform_info name> typename
     detail::param_traits<detail::cl_platform_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_platform_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -2340,18 +2807,20 @@ class Platform : public detail::Wrapper<cl_platform_id>
         vector<Device>* devices) const
     {
         cl_uint n = 0;
-        if( devices == NULL ) {
+        if( devices == nullptr ) {
             return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
         }
-        cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
-        if (err != CL_SUCCESS) {
+        cl_int err = ::clGetDeviceIDs(object_, type, 0, nullptr, &n);
+        if (err != CL_SUCCESS  && err != CL_DEVICE_NOT_FOUND) {
             return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
         }
 
         vector<cl_device_id> ids(n);
-        err = ::clGetDeviceIDs(object_, type, n, ids.data(), NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        if (n>0) {
+            err = ::clGetDeviceIDs(object_, type, n, ids.data(), nullptr);
+            if (err != CL_SUCCESS) {
+                return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+            }
         }
 
         // Cannot trivially assign because we need to capture intermediates 
@@ -2381,7 +2850,7 @@ class Platform : public detail::Wrapper<cl_platform_id>
      *
      *  \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
      *  values returned in devices can be used to identify a specific OpenCL
-     *  device. If \a devices argument is NULL, this argument is ignored.
+     *  device. If \a devices argument is nullptr, this argument is ignored.
      *
      *  \return One of the following values:
      *    - CL_SUCCESS if the function is executed successfully.
@@ -2409,12 +2878,17 @@ class Platform : public detail::Wrapper<cl_platform_id>
             cl_device_id * devices,
             cl_uint* num_devices);
 
-        if( devices == NULL ) {
+        if( devices == nullptr ) {
             return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
         }
 
-        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = nullptr;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
         CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(object_, clGetDeviceIDsFromD3D10KHR);
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetDeviceIDsFromD3D10KHR);
+#endif
 
         cl_uint n = 0;
         cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
@@ -2423,7 +2897,7 @@ class Platform : public detail::Wrapper<cl_platform_id>
             d3d_object,
             d3d_device_set, 
             0, 
-            NULL, 
+            nullptr, 
             &n);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
@@ -2437,7 +2911,7 @@ class Platform : public detail::Wrapper<cl_platform_id>
             d3d_device_set,
             n, 
             ids.data(), 
-            NULL);
+            nullptr);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
         }
@@ -2468,17 +2942,17 @@ class Platform : public detail::Wrapper<cl_platform_id>
     {
         cl_uint n = 0;
 
-        if( platforms == NULL ) {
+        if( platforms == nullptr ) {
             return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
         }
 
-        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        cl_int err = ::clGetPlatformIDs(0, nullptr, &n);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
         }
 
         vector<cl_platform_id> ids(n);
-        err = ::clGetPlatformIDs(n, ids.data(), NULL);
+        err = ::clGetPlatformIDs(n, ids.data(), nullptr);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
         }
@@ -2518,7 +2992,7 @@ class Platform : public detail::Wrapper<cl_platform_id>
      * Wraps clGetPlatformIDs(), returning the first result.
      */
     static Platform get(
-        cl_int * errResult = NULL)
+        cl_int * errResult = nullptr)
     {
         cl_int err;
         Platform default_platform = Platform::getDefault(&err);
@@ -2538,6 +3012,93 @@ class Platform : public detail::Wrapper<cl_platform_id>
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 }; // class Platform
 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+   //! \brief Wrapper for clCreateSubDevices().
+inline cl_int Device::createSubDevices(const cl_device_partition_property* properties,
+                         vector<Device>* devices)
+{
+    cl_uint n = 0;
+    cl_int err = clCreateSubDevices(object_, properties, 0, nullptr, &n);
+    if (err != CL_SUCCESS)
+    {
+        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+    }
+
+    vector<cl_device_id> ids(n);
+    err = clCreateSubDevices(object_, properties, n, ids.data(), nullptr);
+    if (err != CL_SUCCESS)
+    {
+        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+    }
+
+    // Cannot trivially assign because we need to capture intermediates
+    // with safe construction
+    if (devices)
+    {
+        devices->resize(ids.size());
+
+        // Assign to param, constructing with retain behaviour
+        // to correctly capture each underlying CL object
+        for (size_type i = 0; i < ids.size(); i++)
+        {
+            // We do not need to retain because this device is being created
+            // by the runtime
+            (*devices)[i] = Device(ids[i], false);
+        }
+    }
+
+    return CL_SUCCESS;
+}
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+#if defined(cl_ext_device_fission)
+   //! \brief Wrapper for clCreateSubDevicesEXT().
+inline cl_int Device::createSubDevices(const cl_device_partition_property_ext* properties,
+                        vector<Device>* devices)
+{
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+    cl::Device device(object_);
+    cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();
+    CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateSubDevicesEXT);
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
+    CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT);
+#endif
+
+    cl_uint n = 0;
+    cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, nullptr, &n);
+    if (err != CL_SUCCESS)
+    {
+        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+    }
+
+    vector<cl_device_id> ids(n);
+    err =
+        pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), nullptr);
+    if (err != CL_SUCCESS)
+    {
+        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+    }
+    // Cannot trivially assign because we need to capture intermediates
+    // with safe construction
+    if (devices)
+    {
+        devices->resize(ids.size());
+
+        // Assign to param, constructing with retain behaviour
+        // to correctly capture each underlying CL object
+        for (size_type i = 0; i < ids.size(); i++)
+        {
+            // We do not need to retain because this device is being created
+            // by the runtime
+            (*devices)[i] = Device(ids[i], false);
+        }
+    }
+
+    return CL_SUCCESS;
+}
+#endif // defined(cl_ext_device_fission)
+
 CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Platform::default_initialized_;
 CL_HPP_DEFINE_STATIC_MEMBER_ Platform Platform::default_;
 CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS;
@@ -2551,8 +3112,8 @@ CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS;
  * Unload the OpenCL compiler.
  * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
  */
-inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
-UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
 inline cl_int
 UnloadCompiler()
 {
@@ -2560,6 +3121,25 @@ UnloadCompiler()
 }
 #endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
 
+
+#if defined(cl_ext_image_requirements_info)
+enum ImageRequirementsInfoExt : cl_image_requirements_info_ext
+{
+    RowPitchAlign = CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT,
+    BaseAddAlign = CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT,
+    Size = CL_IMAGE_REQUIREMENTS_SIZE_EXT,
+    MaxWidth = CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT,
+    MaxHeight = CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT,
+    MaxDepth = CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT,
+    MaxArraySize = CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT,
+#if defined(cl_ext_image_from_buffer)
+    SlicePitchAlign = CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT,
+#endif
+};
+
+#endif // cl_ext_image_requirements_info
+
+
 /*! \brief Class interface for cl_context.
  *
  *  \note Copies of these objects are shallow, meaning that the copy will refer
@@ -2602,8 +3182,8 @@ class Context
             default_ = Context(
                 CL_DEVICE_TYPE_DEFAULT,
                 properties,
-                NULL,
-                NULL,
+                nullptr,
+                nullptr,
                 &default_error_);
         }
 #if defined(CL_HPP_ENABLE_EXCEPTIONS)
@@ -2622,6 +3202,51 @@ class Context
     static void makeDefaultProvided(const Context &c) {
         default_ = c;
     }
+
+#if defined(cl_ext_image_requirements_info)
+    struct ImageRequirementsInfo {
+
+        ImageRequirementsInfo(cl_mem_flags f, const cl_mem_properties* mem_properties, const ImageFormat* format, const cl_image_desc* desc)
+        {
+            flags = f;
+            properties = mem_properties;
+            image_format = format;
+            image_desc = desc;
+        }
+
+        cl_mem_flags flags = 0;
+        const cl_mem_properties* properties;
+        const ImageFormat* image_format;
+        const cl_image_desc* image_desc;
+    };
+
+    static cl_int getImageRequirementsInfoExtHelper(const Context &context,
+        const ImageRequirementsInfo &info,
+        cl_image_requirements_info_ext param_name,
+        size_type param_value_size,
+        void* param_value,
+        size_type* param_value_size_ret)
+    {
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+        Device device = context.getInfo<CL_CONTEXT_DEVICES>().at(0);
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetImageRequirementsInfoEXT);
+#else
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetImageRequirementsInfoEXT);
+#endif
+
+        if (pfn_clGetImageRequirementsInfoEXT == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION, __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR);
+        }
+
+        return detail::errHandler(
+            pfn_clGetImageRequirementsInfoEXT(context(), info.properties,
+                info.flags, info.image_format, info.image_desc, param_name,
+                param_value_size, param_value, param_value_size_ret),
+            __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR);
+    }
+#endif // cl_ext_image_requirements_info
     
 public:
 #ifdef CL_HPP_UNIT_TEST_ENABLE
@@ -2642,14 +3267,14 @@ class Context
      */
     Context(
         const vector<Device>& devices,
-        cl_context_properties* properties = NULL,
+        const cl_context_properties* properties = nullptr,
         void (CL_CALLBACK * notifyFptr)(
             const char *,
             const void *,
             size_type,
-            void *) = NULL,
-        void* data = NULL,
-        cl_int* err = NULL)
+            void *) = nullptr,
+        void* data = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -2666,21 +3291,25 @@ class Context
             notifyFptr, data, &error);
 
         detail::errHandler(error, __CREATE_CONTEXT_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
+    /*! \brief Constructs a context including a specific device.
+     *
+     *  Wraps clCreateContext().
+     */
     Context(
         const Device& device,
-        cl_context_properties* properties = NULL,
+        const cl_context_properties* properties = nullptr,
         void (CL_CALLBACK * notifyFptr)(
             const char *,
             const void *,
             size_type,
-            void *) = NULL,
-        void* data = NULL,
-        cl_int* err = NULL)
+            void *) = nullptr,
+        void* data = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -2692,7 +3321,7 @@ class Context
             notifyFptr, data, &error);
 
         detail::errHandler(error, __CREATE_CONTEXT_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -2703,27 +3332,27 @@ class Context
      */
     Context(
         cl_device_type type,
-        cl_context_properties* properties = NULL,
+        const cl_context_properties* properties = nullptr,
         void (CL_CALLBACK * notifyFptr)(
             const char *,
             const void *,
             size_type,
-            void *) = NULL,
-        void* data = NULL,
-        cl_int* err = NULL)
+            void *) = nullptr,
+        void* data = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
 
 #if !defined(__APPLE__) && !defined(__MACOS)
         cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
 
-        if (properties == NULL) {
+        if (properties == nullptr) {
             // Get a valid platform ID as we cannot send in a blank one
             vector<Platform> platforms;
             error = Platform::get(&platforms);
             if (error != CL_SUCCESS) {
                 detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-                if (err != NULL) {
+                if (err != nullptr) {
                     *err = error;
                 }
                 return;
@@ -2752,7 +3381,7 @@ class Context
                 // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
                 if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
                     detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-                    if (err != NULL) {
+                    if (err != nullptr) {
                         *err = error;
                     }
                 }
@@ -2765,7 +3394,7 @@ class Context
 
             if (platform_id == 0) {
                 detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
-                if (err != NULL) {
+                if (err != nullptr) {
                     *err = CL_DEVICE_NOT_FOUND;
                 }
                 return;
@@ -2779,49 +3408,21 @@ class Context
             properties, type, notifyFptr, data, &error);
 
         detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Context(const Context& ctx) : detail::Wrapper<cl_type>(ctx) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Context& operator = (const Context &ctx)
-    {
-        detail::Wrapper<cl_type>::operator=(ctx);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Context(Context&& ctx) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(ctx)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Context& operator = (Context &&ctx)
-    {
-        detail::Wrapper<cl_type>::operator=(std::move(ctx));
-        return *this;
-    }
-
 
     /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
      *
      *  \note All calls to this function return the same cl_context as the first.
      */
-    static Context getDefault(cl_int * err = NULL) 
+    static Context getDefault(cl_int * err = nullptr) 
     {
         std::call_once(default_initialized_, makeDefault);
         detail::errHandler(default_error_);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = default_error_;
         }
         return default_;
@@ -2841,7 +3442,7 @@ class Context
         return default_;
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Context() : detail::Wrapper<cl_type>() { }
 
     /*! \brief Constructor from cl_context - takes ownership.
@@ -2873,14 +3474,14 @@ class Context
     }
 
     //! \brief Wrapper for clGetContextInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_context_info name> typename
     detail::param_traits<detail::cl_context_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_context_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -2906,7 +3507,7 @@ class Context
            flags,
            type, 
            0, 
-           NULL, 
+           nullptr, 
            &numEntries);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
@@ -2920,12 +3521,12 @@ class Context
                 type,
                 numEntries,
                 (cl_image_format*)value.data(),
-                NULL);
+                nullptr);
             if (err != CL_SUCCESS) {
                 return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
             }
 
-            formats->assign(begin(value), end(value));
+            formats->assign(value.begin(), value.end());
         }
         else {
             // If no values are being returned, ensure an empty vector comes back
@@ -2934,6 +3535,65 @@ class Context
 
         return CL_SUCCESS;
     }
+
+#if defined(cl_ext_image_requirements_info)
+    template <typename T>
+    cl_int getImageRequirementsInfoExt(cl_image_requirements_info_ext name,
+        T* param,
+        cl_mem_flags flags = 0,
+        const cl_mem_properties* properties = nullptr,
+        const ImageFormat* image_format = nullptr,
+        const cl_image_desc* image_desc = nullptr) const
+    {
+        ImageRequirementsInfo imageInfo = {flags, properties, image_format, image_desc};
+
+        return detail::errHandler(
+            detail::getInfo(
+                Context::getImageRequirementsInfoExtHelper, *this, imageInfo, name, param),
+                __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR);
+    }
+
+    template <cl_image_requirements_info_ext type> typename
+    detail::param_traits<detail::cl_image_requirements_info_ext, type>::param_type
+        getImageRequirementsInfoExt(cl_mem_flags flags = 0,
+            const cl_mem_properties* properties = nullptr,
+            const ImageFormat* image_format = nullptr,
+            const cl_image_desc* image_desc = nullptr,
+            cl_int* err = nullptr) const
+    {
+        typename detail::param_traits<
+        detail::cl_image_requirements_info_ext, type>::param_type param;
+        cl_int result = getImageRequirementsInfoExt(type, &param, flags, properties, image_format, image_desc);
+        if (err != nullptr) {
+            *err = result;
+        }
+        return param;
+    }
+#endif // cl_ext_image_requirements_info
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 300
+    /*! \brief  Registers a destructor callback function with a context.
+     *
+     *  Wraps clSetContextDestructorCallback().
+     * 
+     * Each call to this function registers the specified callback function on
+     * a destructor callback stack associated with context. The registered
+     * callback functions are called in the reverse order in which they were registered.
+     * If a context callback function was specified when context was created,
+     * it will not be called after any context destructor callback is called.
+     */
+    cl_int setDestructorCallback(
+        void (CL_CALLBACK * pfn_notify)(cl_context, void *),
+        void * user_data = nullptr)
+    {
+        return detail::errHandler(
+            ::clSetContextDestructorCallback(
+                object_,
+                pfn_notify,
+                user_data),
+                __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR);
+    }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300
 };
 
 inline void Device::makeDefault()
@@ -2980,7 +3640,7 @@ CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Context::default_error_ = CL_SUCCESS;
 class Event : public detail::Wrapper<cl_event>
 {
 public:
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Event() : detail::Wrapper<cl_type>() { }
 
     /*! \brief Constructor from cl_event - takes ownership.
@@ -3015,14 +3675,14 @@ class Event : public detail::Wrapper<cl_event>
     }
 
     //! \brief Wrapper for clGetEventInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_event_info name> typename
     detail::param_traits<detail::cl_event_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_event_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -3038,14 +3698,14 @@ class Event : public detail::Wrapper<cl_event>
     }
 
     //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_profiling_info name> typename
     detail::param_traits<detail::cl_profiling_info, name>::param_type
-    getProfilingInfo(cl_int* err = NULL) const
+    getProfilingInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_profiling_info, name>::param_type param;
         cl_int result = getProfilingInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -3069,8 +3729,8 @@ class Event : public detail::Wrapper<cl_event>
      */
     cl_int setCallback(
         cl_int type,
-        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),		
-        void * user_data = NULL)
+        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+        void * user_data = nullptr)
     {
         return detail::errHandler(
             ::clSetEventCallback(
@@ -3089,9 +3749,12 @@ class Event : public detail::Wrapper<cl_event>
     static cl_int
     waitForEvents(const vector<Event>& events)
     {
+        static_assert(sizeof(cl::Event) == sizeof(cl_event),
+        "Size of cl::Event must be equal to size of cl_event");
+
         return detail::errHandler(
             ::clWaitForEvents(
-                (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL),
+                (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : nullptr),
             __WAIT_FOR_EVENTS_ERR);
     }
 };
@@ -3110,7 +3773,7 @@ class UserEvent : public Event
      */
     UserEvent(
         const Context& context,
-        cl_int * err = NULL)
+        cl_int * err = nullptr)
     {
         cl_int error;
         object_ = ::clCreateUserEvent(
@@ -3118,12 +3781,12 @@ class UserEvent : public Event
             &error);
 
         detail::errHandler(error, __CREATE_USER_EVENT_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     UserEvent() : Event() { }
 
     /*! \brief Sets the execution status of a user event object.
@@ -3148,7 +3811,7 @@ WaitForEvents(const vector<Event>& events)
 {
     return detail::errHandler(
         ::clWaitForEvents(
-            (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL),
+            (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : nullptr),
         __WAIT_FOR_EVENTS_ERR);
 }
 
@@ -3163,7 +3826,7 @@ WaitForEvents(const vector<Event>& events)
 class Memory : public detail::Wrapper<cl_mem>
 {
 public:
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Memory() : detail::Wrapper<cl_type>() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -3191,35 +3854,6 @@ class Memory : public detail::Wrapper<cl_mem>
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Memory(const Memory& mem) : detail::Wrapper<cl_type>(mem) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Memory& operator = (const Memory &mem)
-    {
-        detail::Wrapper<cl_type>::operator=(mem);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Memory(Memory&& mem) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(mem)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Memory& operator = (Memory &&mem)
-    {
-        detail::Wrapper<cl_type>::operator=(std::move(mem));
-        return *this;
-    }
-
-
     //! \brief Wrapper for clGetMemObjectInfo().
     template <typename T>
     cl_int getInfo(cl_mem_info name, T* param) const
@@ -3230,14 +3864,14 @@ class Memory : public detail::Wrapper<cl_mem>
     }
 
     //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_mem_info name> typename
     detail::param_traits<detail::cl_mem_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_mem_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -3258,8 +3892,8 @@ class Memory : public detail::Wrapper<cl_mem>
      *  value - not the Memory class instance.
      */
     cl_int setDestructorCallback(
-        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),		
-        void * user_data = NULL)
+        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
+        void * user_data = nullptr)
     {
         return detail::errHandler(
             ::clSetMemObjectDestructorCallback(
@@ -3371,8 +4005,8 @@ inline cl_int enqueueMapSVM(
     cl_bool blocking,
     cl_map_flags flags,
     size_type size,
-    const vector<Event>* events = NULL,
-    Event* event = NULL);
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr);
 
 /**
  * STL-like allocator class for managing SVM objects provided for convenience.
@@ -3434,12 +4068,12 @@ class SVMAllocator {
     {
     }
 
-    pointer address(reference r) CL_HPP_NOEXCEPT_
+    pointer address(reference r) noexcept
     {
         return std::addressof(r);
     }
 
-    const_pointer address(const_reference r) CL_HPP_NOEXCEPT_
+    const_pointer address(const_reference r) noexcept
     {
         return std::addressof(r);
     }
@@ -3452,7 +4086,8 @@ class SVMAllocator {
      */
     pointer allocate(
         size_type size,
-        typename cl::SVMAllocator<void, SVMTrait>::const_pointer = 0)
+        typename cl::SVMAllocator<void, SVMTrait>::const_pointer = 0,
+        bool map = true)
     {
         // Allocate memory with default alignment matching the size of the type
         void* voidPointer =
@@ -3471,11 +4106,15 @@ class SVMAllocator {
 #endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
 
         // If allocation was coarse-grained then map it
-        if (!(SVMTrait::getSVMMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) {
+        if (map && !(SVMTrait::getSVMMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) {
             cl_int err = enqueueMapSVM(retValue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, size*sizeof(T));
             if (err != CL_SUCCESS) {
+                clSVMFree(context_(), retValue);
+                retValue = nullptr;
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
                 std::bad_alloc excep;
                 throw excep;
+#endif
             }
         }
 
@@ -3492,7 +4131,7 @@ class SVMAllocator {
      * Return the maximum possible allocation size.
      * This is the minimum of the maximum sizes of all devices in the context.
      */
-    size_type max_size() const CL_HPP_NOEXCEPT_
+    size_type max_size() const noexcept
     {
         size_type maxSize = std::numeric_limits<size_type>::max() / sizeof(T);
 
@@ -3601,7 +4240,7 @@ cl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Arg
 
         return cl::pointer<T, detail::Deleter<Alloc>>(tmp, detail::Deleter<Alloc>{alloc, copies});
     }
-    catch (std::bad_alloc b)
+    catch (std::bad_alloc&)
     {
         std::allocator_traits<Alloc>::deallocate(alloc, tmp, copies);
         throw;
@@ -3665,44 +4304,90 @@ class Buffer : public Memory
         const Context& context,
         cl_mem_flags flags,
         size_type size,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
         object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
 
         detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    /*! \brief Constructs a Buffer in the default context.
+#if CL_HPP_TARGET_OPENCL_VERSION >= 300
+    /*! \brief Constructs a Buffer in a specified context and with specified properties.
      *
-     *  Wraps clCreateBuffer().
+     *  Wraps clCreateBufferWithProperties().
      *
+     *  \param properties Optional list of properties for the buffer object and
+     *                    their corresponding values. The non-empty list must
+     *                    end with 0. 
      *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
-     *                  specified.  Note alignment & exclusivity requirements.
-     *
-     *  \see Context::getDefault()
+     *                  specified. Note alignment & exclusivity requirements.
      */
     Buffer(
-         cl_mem_flags flags,
+        const Context& context,
+        const vector<cl_mem_properties>& properties,
+        cl_mem_flags flags,
         size_type size,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
 
-        Context context = Context::getDefault(err);
-
-        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+        if (properties.empty()) {
+            object_ = ::clCreateBufferWithProperties(context(), nullptr, flags,
+                                                     size, host_ptr, &error);
+        }
+        else {
+            object_ = ::clCreateBufferWithProperties(
+                context(), properties.data(), flags, size, host_ptr, &error);
+        }
 
         detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
+#endif
+
+    /*! \brief Constructs a Buffer in the default context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified.  Note alignment & exclusivity requirements.
+     *
+     *  \see Context::getDefault()
+     */
+    Buffer(
+        cl_mem_flags flags,
+        size_type size,
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr) : Buffer(Context::getDefault(err), flags, size, host_ptr, err) { }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 300
+    /*! \brief Constructs a Buffer in the default context and with specified properties.
+     *
+     *  Wraps clCreateBufferWithProperties().
+     *
+     *  \param properties Optional list of properties for the buffer object and
+     *                    their corresponding values. The non-empty list must
+     *                    end with 0. 
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified. Note alignment & exclusivity requirements.
+     * 
+     *  \see Context::getDefault()
+     */
+    Buffer(
+        const vector<cl_mem_properties>& properties,
+        cl_mem_flags flags,
+        size_type size,
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr) : Buffer(Context::getDefault(err), properties, flags, size, host_ptr, err) { }
+#endif
 
     /*!
      * \brief Construct a Buffer from a host container via iterators.
@@ -3715,7 +4400,7 @@ class Buffer : public Memory
         IteratorType endIterator,
         bool readOnly,
         bool useHostPtr = false,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         typedef typename std::iterator_traits<IteratorType>::value_type DataType;
         cl_int error;
@@ -3736,20 +4421,20 @@ class Buffer : public Memory
         Context context = Context::getDefault(err);
 
         if( useHostPtr ) {
-            object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+            object_ = ::clCreateBuffer(context(), flags, size, const_cast<DataType*>(&*startIterator), &error);
         } else {
             object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
         }
 
         detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 
         if( !useHostPtr ) {
             error = cl::copy(startIterator, endIterator, *this);
             detail::errHandler(error, __CREATE_BUFFER_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -3762,7 +4447,7 @@ class Buffer : public Memory
      */
     template< typename IteratorType >
     Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
-        bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+        bool readOnly, bool useHostPtr = false, cl_int* err = nullptr);
     
     /*!
     * \brief Construct a Buffer from a host container via iterators using a specified queue.
@@ -3770,9 +4455,9 @@ class Buffer : public Memory
     */
     template< typename IteratorType >
     Buffer(const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator,
-        bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+        bool readOnly, bool useHostPtr = false, cl_int* err = nullptr);
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Buffer() : Memory() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -3795,33 +4480,6 @@ class Buffer : public Memory
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Buffer(const Buffer& buf) : Memory(buf) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Buffer& operator = (const Buffer &buf)
-    {
-        Memory::operator=(buf);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Buffer(Buffer&& buf) CL_HPP_NOEXCEPT_ : Memory(std::move(buf)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Buffer& operator = (Buffer &&buf)
-    {
-        Memory::operator=(std::move(buf));
-        return *this;
-    }
 
 #if CL_HPP_TARGET_OPENCL_VERSION >= 110
     /*! \brief Creates a new buffer object from this.
@@ -3832,7 +4490,7 @@ class Buffer : public Memory
         cl_mem_flags flags,
         cl_buffer_create_type buffer_create_type,
         const void * buffer_create_info,
-        cl_int * err = NULL)
+        cl_int * err = nullptr)
     {
         Buffer result;
         cl_int error;
@@ -3844,12 +4502,12 @@ class Buffer : public Memory
             &error);
 
         detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 
         return result;
-    }		
+    }
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
 };
 
@@ -3876,7 +4534,7 @@ class BufferD3D10 : public Buffer
         const Context& context,
         cl_mem_flags flags,
         ID3D10Buffer* bufobj,
-        cl_int * err = NULL) : pfn_clCreateFromD3D10BufferKHR(nullptr)
+        cl_int * err = nullptr) : pfn_clCreateFromD3D10BufferKHR(nullptr)
     {
         typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
             cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
@@ -3884,14 +4542,15 @@ class BufferD3D10 : public Buffer
         PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR;
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
         vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
-        cl_platform platform = -1;
+        cl_platform platform = nullptr;
         for( int i = 0; i < props.size(); ++i ) {
             if( props[i] == CL_CONTEXT_PLATFORM ) {
                 platform = props[i+1];
             }
         }
         CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateFromD3D10BufferKHR);
-#elif CL_HPP_TARGET_OPENCL_VERSION >= 110
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
         CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateFromD3D10BufferKHR);
 #endif
 
@@ -3902,13 +4561,14 @@ class BufferD3D10 : public Buffer
             bufobj,
             &error);
 
+        // TODO: This should really have a D3D10 error code!
         detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     BufferD3D10() : Buffer() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -3930,35 +4590,6 @@ class BufferD3D10 : public Buffer
         Buffer::operator=(rhs);
         return *this;
     }
-
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferD3D10(const BufferD3D10& buf) : 
-        Buffer(buf) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferD3D10& operator = (const BufferD3D10 &buf)
-    {
-        Buffer::operator=(buf);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferD3D10(BufferD3D10&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferD3D10& operator = (BufferD3D10 &&buf)
-    {
-        Buffer::operator=(std::move(buf));
-        return *this;
-    }
 };
 #endif
 
@@ -3982,7 +4613,7 @@ class BufferGL : public Buffer
         const Context& context,
         cl_mem_flags flags,
         cl_GLuint bufobj,
-        cl_int * err = NULL)
+        cl_int * err = nullptr)
     {
         cl_int error;
         object_ = ::clCreateFromGLBuffer(
@@ -3992,12 +4623,12 @@ class BufferGL : public Buffer
             &error);
 
         detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     BufferGL() : Buffer() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -4020,33 +4651,6 @@ class BufferGL : public Buffer
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferGL(const BufferGL& buf) : Buffer(buf) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferGL& operator = (const BufferGL &buf)
-    {
-        Buffer::operator=(buf);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferGL(BufferGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferGL& operator = (BufferGL &&buf)
-    {
-        Buffer::operator=(std::move(buf));
-        return *this;
-    }
 
     //! \brief Wrapper for clGetGLObjectInfo().
     cl_int getObjectInfo(
@@ -4079,7 +4683,7 @@ class BufferRenderGL : public Buffer
         const Context& context,
         cl_mem_flags flags,
         cl_GLuint bufobj,
-        cl_int * err = NULL)
+        cl_int * err = nullptr)
     {
         cl_int error;
         object_ = ::clCreateFromGLRenderbuffer(
@@ -4089,12 +4693,12 @@ class BufferRenderGL : public Buffer
             &error);
 
         detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     BufferRenderGL() : Buffer() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -4117,33 +4721,6 @@ class BufferRenderGL : public Buffer
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferRenderGL(const BufferRenderGL& buf) : Buffer(buf) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferRenderGL& operator = (const BufferRenderGL &buf)
-    {
-        Buffer::operator=(buf);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferRenderGL(BufferRenderGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    BufferRenderGL& operator = (BufferRenderGL &&buf)
-    {
-        Buffer::operator=(std::move(buf));
-        return *this;
-    }
 
     //! \brief Wrapper for clGetGLObjectInfo().
     cl_int getObjectInfo(
@@ -4165,7 +4742,7 @@ class BufferRenderGL : public Buffer
 class Image : public Memory
 {
 protected:
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Image() : Memory() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -4188,34 +4765,6 @@ class Image : public Memory
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image(const Image& img) : Memory(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image& operator = (const Image &img)
-    {
-        Memory::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image(Image&& img) CL_HPP_NOEXCEPT_ : Memory(std::move(img)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image& operator = (Image &&img)
-    {
-        Memory::operator=(std::move(img));
-        return *this;
-    }
-
 
 public:
     //! \brief Wrapper for clGetImageInfo().
@@ -4228,14 +4777,14 @@ class Image : public Memory
     }
     
     //! \brief Wrapper for clGetImageInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_image_info name> typename
     detail::param_traits<detail::cl_image_info, name>::param_type
-    getImageInfo(cl_int* err = NULL) const
+    getImageInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_image_info, name>::param_type param;
         cl_int result = getImageInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -4261,16 +4810,15 @@ class Image1D : public Image
         cl_mem_flags flags,
         ImageFormat format,
         size_type width,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
-        cl_image_desc desc =
-        {
-            CL_MEM_OBJECT_IMAGE1D,
-            width,
-            0, 0, 0, 0, 0, 0, 0, 0
-        };
+
+        cl_image_desc desc = {};
+        desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+        desc.image_width = width;
+
         object_ = ::clCreateImage(
             context(), 
             flags, 
@@ -4280,12 +4828,12 @@ class Image1D : public Image
             &error);
 
         detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Image1D() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -4308,33 +4856,6 @@ class Image1D : public Image
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1D(const Image1D& img) : Image(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1D& operator = (const Image1D &img)
-    {
-        Image::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1D(Image1D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1D& operator = (Image1D &&img)
-    {
-        Image::operator=(std::move(img));
-        return *this;
-    }
 
 };
 
@@ -4350,26 +4871,25 @@ class Image1DBuffer : public Image
         ImageFormat format,
         size_type width,
         const Buffer &buffer,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
-        cl_image_desc desc =
-        {
-            CL_MEM_OBJECT_IMAGE1D_BUFFER,
-            width,
-            0, 0, 0, 0, 0, 0, 0,
-            buffer()
-        };
+
+        cl_image_desc desc = {};
+        desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        desc.image_width = width;
+        desc.buffer = buffer();
+
         object_ = ::clCreateImage(
             context(), 
             flags, 
             &format, 
             &desc, 
-            NULL, 
+            nullptr, 
             &error);
 
         detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -4392,33 +4912,7 @@ class Image1DBuffer : public Image
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DBuffer(const Image1DBuffer& img) : Image(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DBuffer& operator = (const Image1DBuffer &img)
-    {
-        Image::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DBuffer(Image1DBuffer&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
 
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DBuffer& operator = (Image1DBuffer &&img)
-    {
-        Image::operator=(std::move(img));
-        return *this;
-    }
 
 };
 
@@ -4435,19 +4929,17 @@ class Image1DArray : public Image
         size_type arraySize,
         size_type width,
         size_type rowPitch,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
-        cl_image_desc desc =
-        {
-            CL_MEM_OBJECT_IMAGE1D_ARRAY,
-            width,
-            0, 0,  // height, depth (unused)
-            arraySize,
-            rowPitch,
-            0, 0, 0, 0
-        };
+
+        cl_image_desc desc = {};
+        desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+        desc.image_width = width;
+        desc.image_array_size = arraySize;
+        desc.image_row_pitch = rowPitch;
+
         object_ = ::clCreateImage(
             context(), 
             flags, 
@@ -4457,7 +4949,7 @@ class Image1DArray : public Image
             &error);
 
         detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -4481,33 +4973,6 @@ class Image1DArray : public Image
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DArray(const Image1DArray& img) : Image(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DArray& operator = (const Image1DArray &img)
-    {
-        Image::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DArray(Image1DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image1DArray& operator = (Image1DArray &&img)
-    {
-        Image::operator=(std::move(img));
-        return *this;
-    }
 
 };
 #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120
@@ -4533,8 +4998,8 @@ class Image2D : public Image
         size_type width,
         size_type height,
         size_type row_pitch = 0,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
         bool useCreateImage;
@@ -4554,15 +5019,12 @@ class Image2D : public Image
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
         if (useCreateImage)
         {
-            cl_image_desc desc =
-            {
-                CL_MEM_OBJECT_IMAGE2D,
-                width,
-                height,
-                0, 0, // depth, array size (unused)
-                row_pitch,
-                0, 0, 0, 0
-            };
+            cl_image_desc desc = {};
+            desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+            desc.image_width = width;
+            desc.image_height = height;
+            desc.image_row_pitch = row_pitch;
+
             object_ = ::clCreateImage(
                 context(),
                 flags,
@@ -4572,7 +5034,7 @@ class Image2D : public Image
                 &error);
 
             detail::errHandler(error, __CREATE_IMAGE_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -4584,17 +5046,20 @@ class Image2D : public Image
                 context(), flags,&format, width, height, row_pitch, host_ptr, &error);
 
             detail::errHandler(error, __CREATE_IMAGE2D_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
 #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120
     }
 
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
     /*! \brief Constructs a 2D Image from a buffer.
     * \note This will share storage with the underlying buffer.
     *
+    *  Requires OpenCL 2.0 or newer or OpenCL 1.2 and the 
+    *  cl_khr_image2d_from_buffer extension.
+    *
     *  Wraps clCreateImage().
     */
     Image2D(
@@ -4608,17 +5073,13 @@ class Image2D : public Image
     {
         cl_int error;
 
-        cl_image_desc desc =
-        {
-            CL_MEM_OBJECT_IMAGE2D,
-            width,
-            height,
-            0, 0, // depth, array size (unused)
-            row_pitch,
-            0, 0, 0,
-            // Use buffer as input to image
-            sourceBuffer()
-        };
+        cl_image_desc desc = {};
+        desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+        desc.image_width = width;
+        desc.image_height = height;
+        desc.image_row_pitch = row_pitch;
+        desc.buffer = sourceBuffer();
+
         object_ = ::clCreateImage(
             context(),
             0, // flags inherited from buffer
@@ -4632,7 +5093,7 @@ class Image2D : public Image
             *err = error;
         }
     }
-#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
     /*! \brief Constructs a 2D Image from an image.
@@ -4672,19 +5133,16 @@ class Image2D : public Image
         // Update only the channel order. 
         // Channel format inherited from source.
         sourceFormat.image_channel_order = order;
-        cl_image_desc desc =
-        {
-            CL_MEM_OBJECT_IMAGE2D,
-            sourceWidth,
-            sourceHeight,
-            0, 0, // depth (unused), array size (unused)
-            sourceRowPitch,
-            0, // slice pitch (unused)
-            sourceNumMIPLevels,
-            sourceNumSamples,
-            // Use buffer as input to image
-            sourceImage()
-        };
+
+        cl_image_desc desc = {};
+        desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+        desc.image_width = sourceWidth;
+        desc.image_height = sourceHeight;
+        desc.image_row_pitch = sourceRowPitch;
+        desc.num_mip_levels = sourceNumMIPLevels;
+        desc.num_samples = sourceNumSamples;
+        desc.buffer = sourceImage();
+
         object_ = ::clCreateImage(
             context(),
             0, // flags should be inherited from mem_object
@@ -4700,7 +5158,7 @@ class Image2D : public Image
     }
 #endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Image2D() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -4723,33 +5181,8 @@ class Image2D : public Image
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2D(const Image2D& img) : Image(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2D& operator = (const Image2D &img)
-    {
-        Image::operator=(img);
-        return *this;
-    }
 
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2D(Image2D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
 
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2D& operator = (Image2D &&img)
-    {
-        Image::operator=(std::move(img));
-        return *this;
-    }
 
 };
 
@@ -4764,7 +5197,7 @@ class Image2D : public Image
  *  \see Memory
  *  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
  */
-class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D 
+class CL_API_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D 
 {
 public:
     /*! \brief Constructs an Image2DGL in a specified context, from a given
@@ -4778,7 +5211,7 @@ class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D
         cl_GLenum target,
         cl_GLint  miplevel,
         cl_GLuint texobj,
-        cl_int * err = NULL)
+        cl_int * err = nullptr)
     {
         cl_int error;
         object_ = ::clCreateFromGLTexture2D(
@@ -4790,13 +5223,13 @@ class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D
             &error);
 
         detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 
     }
     
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Image2DGL() : Image2D() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -4819,35 +5252,9 @@ class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DGL(const Image2DGL& img) : Image2D(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DGL& operator = (const Image2DGL &img)
-    {
-        Image2D::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DGL(Image2DGL&& img) CL_HPP_NOEXCEPT_ : Image2D(std::move(img)) {}
 
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DGL& operator = (Image2DGL &&img)
-    {
-        Image2D::operator=(std::move(img));
-        return *this;
-    }
 
-} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+} CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
 #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS
 
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
@@ -4866,21 +5273,19 @@ class Image2DArray : public Image
         size_type height,
         size_type rowPitch,
         size_type slicePitch,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
-        cl_image_desc desc =
-        {
-            CL_MEM_OBJECT_IMAGE2D_ARRAY,
-            width,
-            height,
-            0,       // depth (unused)
-            arraySize,
-            rowPitch,
-            slicePitch,
-            0, 0, 0
-        };
+
+        cl_image_desc desc = {};
+        desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+        desc.image_width = width;
+        desc.image_height = height;
+        desc.image_array_size = arraySize;
+        desc.image_row_pitch = rowPitch;
+        desc.image_slice_pitch = slicePitch;
+
         object_ = ::clCreateImage(
             context(), 
             flags, 
@@ -4890,7 +5295,7 @@ class Image2DArray : public Image
             &error);
 
         detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -4912,33 +5317,6 @@ class Image2DArray : public Image
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DArray(const Image2DArray& img) : Image(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DArray& operator = (const Image2DArray &img)
-    {
-        Image::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DArray(Image2DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image2DArray& operator = (Image2DArray &&img)
-    {
-        Image::operator=(std::move(img));
-        return *this;
-    }
 };
 #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120
 
@@ -4964,8 +5342,8 @@ class Image3D : public Image
         size_type depth,
         size_type row_pitch = 0,
         size_type slice_pitch = 0,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
+        void* host_ptr = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
         bool useCreateImage;
@@ -4985,17 +5363,14 @@ class Image3D : public Image
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
         if (useCreateImage)
         {
-            cl_image_desc desc =
-            {
-                CL_MEM_OBJECT_IMAGE3D,
-                width,
-                height,
-                depth,
-                0,      // array size (unused)
-                row_pitch,
-                slice_pitch,
-                0, 0, 0
-            };
+            cl_image_desc desc = {};
+            desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+            desc.image_width = width;
+            desc.image_height = height;
+            desc.image_depth = depth;
+            desc.image_row_pitch = row_pitch;
+            desc.image_slice_pitch = slice_pitch;
+
             object_ = ::clCreateImage(
                 context(), 
                 flags, 
@@ -5005,7 +5380,7 @@ class Image3D : public Image
                 &error);
 
             detail::errHandler(error, __CREATE_IMAGE_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -5018,14 +5393,14 @@ class Image3D : public Image
                 slice_pitch, host_ptr, &error);
 
             detail::errHandler(error, __CREATE_IMAGE3D_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
 #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Image3D() : Image() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -5048,33 +5423,6 @@ class Image3D : public Image
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3D(const Image3D& img) : Image(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3D& operator = (const Image3D &img)
-    {
-        Image::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3D(Image3D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3D& operator = (Image3D &&img)
-    {
-        Image::operator=(std::move(img));
-        return *this;
-    }
 };
 
 #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
@@ -5100,7 +5448,7 @@ class Image3DGL : public Image3D
         cl_GLenum target,
         cl_GLint  miplevel,
         cl_GLuint texobj,
-        cl_int * err = NULL)
+        cl_int * err = nullptr)
     {
         cl_int error;
         object_ = ::clCreateFromGLTexture3D(
@@ -5112,12 +5460,12 @@ class Image3DGL : public Image3D
             &error);
 
         detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Image3DGL() : Image3D() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -5140,33 +5488,6 @@ class Image3DGL : public Image3D
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3DGL(const Image3DGL& img) : Image3D(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3DGL& operator = (const Image3DGL &img)
-    {
-        Image3D::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3DGL(Image3DGL&& img) CL_HPP_NOEXCEPT_ : Image3D(std::move(img)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Image3DGL& operator = (Image3DGL &&img)
-    {
-        Image3D::operator=(std::move(img));
-        return *this;
-    }
 };
 #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS
 
@@ -5186,7 +5507,7 @@ class ImageGL : public Image
         cl_GLenum target,
         cl_GLint  miplevel,
         cl_GLuint texobj,
-        cl_int * err = NULL)
+        cl_int * err = nullptr)
     {
         cl_int error;
         object_ = ::clCreateFromGLTexture(
@@ -5198,7 +5519,7 @@ class ImageGL : public Image
             &error);
 
         detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -5221,33 +5542,6 @@ class ImageGL : public Image
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    ImageGL(const ImageGL& img) : Image(img) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    ImageGL& operator = (const ImageGL &img)
-    {
-        Image::operator=(img);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    ImageGL(ImageGL&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    ImageGL& operator = (ImageGL &&img)
-    {
-        Image::operator=(std::move(img));
-        return *this;
-    }
 };
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
@@ -5277,7 +5571,7 @@ class Pipe : public Memory
         const Context& context,
         cl_uint packet_size,
         cl_uint max_packets,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -5285,7 +5579,7 @@ class Pipe : public Memory
         object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error);
 
         detail::errHandler(error, __CREATE_PIPE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -5301,7 +5595,7 @@ class Pipe : public Memory
     Pipe(
         cl_uint packet_size,
         cl_uint max_packets,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -5311,12 +5605,12 @@ class Pipe : public Memory
         object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error);
 
         detail::errHandler(error, __CREATE_PIPE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Pipe() : Memory() { }
 
     /*! \brief Constructor from cl_mem - takes ownership.
@@ -5339,33 +5633,7 @@ class Pipe : public Memory
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Pipe(const Pipe& pipe) : Memory(pipe) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Pipe& operator = (const Pipe &pipe)
-    {
-        Memory::operator=(pipe);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Pipe(Pipe&& pipe) CL_HPP_NOEXCEPT_ : Memory(std::move(pipe)) {}
 
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Pipe& operator = (Pipe &&pipe)
-    {
-        Memory::operator=(std::move(pipe));
-        return *this;
-    }
 
     //! \brief Wrapper for clGetMemObjectInfo().
     template <typename T>
@@ -5377,14 +5645,14 @@ class Pipe : public Memory
     }
 
     //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_pipe_info name> typename
         detail::param_traits<detail::cl_pipe_info, name>::param_type
-        getInfo(cl_int* err = NULL) const
+        getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_pipe_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -5404,7 +5672,7 @@ class Pipe : public Memory
 class Sampler : public detail::Wrapper<cl_sampler>
 {
 public:
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Sampler() { }
 
     /*! \brief Constructs a Sampler in a specified context.
@@ -5416,7 +5684,7 @@ class Sampler : public detail::Wrapper<cl_sampler>
         cl_bool normalized_coords,
         cl_addressing_mode addressing_mode,
         cl_filter_mode filter_mode,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -5432,7 +5700,7 @@ class Sampler : public detail::Wrapper<cl_sampler>
             &error);
 
         detail::errHandler(error, __CREATE_SAMPLER_WITH_PROPERTIES_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 #else
@@ -5444,7 +5712,7 @@ class Sampler : public detail::Wrapper<cl_sampler>
             &error);
 
         detail::errHandler(error, __CREATE_SAMPLER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 #endif        
@@ -5472,33 +5740,7 @@ class Sampler : public detail::Wrapper<cl_sampler>
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Sampler(const Sampler& sam) : detail::Wrapper<cl_type>(sam) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Sampler& operator = (const Sampler &sam)
-    {
-        detail::Wrapper<cl_type>::operator=(sam);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Sampler(Sampler&& sam) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(sam)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Sampler& operator = (Sampler &&sam)
-    {
-        detail::Wrapper<cl_type>::operator=(std::move(sam));
-        return *this;
-    }
+  
 
     //! \brief Wrapper for clGetSamplerInfo().
     template <typename T>
@@ -5510,14 +5752,14 @@ class Sampler : public detail::Wrapper<cl_sampler>
     }
 
     //! \brief Wrapper for clGetSamplerInfo() that returns by value.
-    template <cl_int name> typename
+    template <cl_sampler_info name> typename
     detail::param_traits<detail::cl_sampler_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_sampler_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -5573,6 +5815,15 @@ class NDRange
         sizes_[2] = size2;
     }
 
+    //! \brief Constructs one-dimensional range.
+    NDRange(array<size_type, 1> a) : NDRange(a[0]){}
+
+    //! \brief Constructs two-dimensional range.
+    NDRange(array<size_type, 2> a) : NDRange(a[0], a[1]){}
+
+    //! \brief Constructs three-dimensional range.
+    NDRange(array<size_type, 3> a) : NDRange(a[0], a[1], a[2]){}
+
     /*! \brief Conversion operator to const size_type *.
      *  
      *  \returns a pointer to the size of the first dimension.
@@ -5643,7 +5894,7 @@ template <>
 struct KernelArgumentHandler<LocalSpaceArg, void>
 {
     static size_type size(const LocalSpaceArg& value) { return value.size_; }
-    static const void* ptr(const LocalSpaceArg&) { return NULL; }
+    static const void* ptr(const LocalSpaceArg&) { return nullptr; }
 };
 
 } 
@@ -5670,9 +5921,10 @@ Local(size_type size)
 class Kernel : public detail::Wrapper<cl_kernel>
 {
 public:
-    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+    inline Kernel(const Program& program, const string& name, cl_int* err = nullptr);
+    inline Kernel(const Program& program, const char* name, cl_int* err = nullptr);
 
-    //! \brief Default constructor - initializes to NULL.
+    //! \brief Default constructor - initializes to nullptr.
     Kernel() { }
 
     /*! \brief Constructor from cl_kernel - takes ownership.
@@ -5697,33 +5949,8 @@ class Kernel : public detail::Wrapper<cl_kernel>
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Kernel& operator = (const Kernel &kernel)
-    {
-        detail::Wrapper<cl_type>::operator=(kernel);
-        return *this;
-    }
 
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Kernel(Kernel&& kernel) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(kernel)) {}
 
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Kernel& operator = (Kernel &&kernel)
-    {
-        detail::Wrapper<cl_type>::operator=(std::move(kernel));
-        return *this;
-    }
 
     template <typename T>
     cl_int getInfo(cl_kernel_info name, T* param) const
@@ -5733,14 +5960,14 @@ class Kernel : public detail::Wrapper<cl_kernel>
             __GET_KERNEL_INFO_ERR);
     }
 
-    template <cl_int name> typename
+    template <cl_kernel_info name> typename
     detail::param_traits<detail::cl_kernel_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_kernel_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -5755,14 +5982,14 @@ class Kernel : public detail::Wrapper<cl_kernel>
             __GET_KERNEL_ARG_INFO_ERR);
     }
 
-    template <cl_int name> typename
+    template <cl_kernel_arg_info name> typename
     detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
-    getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+    getArgInfo(cl_uint argIndex, cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_kernel_arg_info, name>::param_type param;
         cl_int result = getArgInfo(argIndex, name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -5779,44 +6006,52 @@ class Kernel : public detail::Wrapper<cl_kernel>
                 __GET_KERNEL_WORK_GROUP_INFO_ERR);
     }
 
-    template <cl_int name> typename
+    template <cl_kernel_work_group_info name> typename
     detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
-        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+        getWorkGroupInfo(const Device& device, cl_int* err = nullptr) const
     {
         typename detail::param_traits<
         detail::cl_kernel_work_group_info, name>::param_type param;
         cl_int result = getWorkGroupInfo(device, name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
     }
     
-#if CL_HPP_TARGET_OPENCL_VERSION >= 200
-#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
+#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210
     cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const
     {
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+        return detail::errHandler(
+            clGetKernelSubGroupInfo(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr),
+            __GET_KERNEL_SUB_GROUP_INFO_ERR);
+
+#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
         typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR;
-        static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = NULL;
+        static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = nullptr;
         CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR);
 
         return detail::errHandler(
             pfn_clGetKernelSubGroupInfoKHR(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr),
-            __GET_KERNEL_ARG_INFO_ERR);
+            __GET_KERNEL_SUB_GROUP_INFO_ERR);
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
     }
 
-    template <cl_int name>
-        size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const
+    template <cl_kernel_sub_group_info name>
+        size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = nullptr) const
     {
         size_type param;
         cl_int result = getSubGroupInfo(dev, name, range, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
     }
-#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
-#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#endif // defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210
 
 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
     /*! \brief setArg overload taking a shared_ptr type
@@ -5969,7 +6204,40 @@ class Kernel : public detail::Wrapper<cl_kernel>
             sizeof(void*)*(1 + sizeof...(Ts)),
             pointerList.data()));
     }
+
+    template<typename T>
+    cl_int setExecInfo(cl_kernel_exec_info param_name, const T& val)
+    {
+        return detail::errHandler(
+            ::clSetKernelExecInfo(
+            object_,
+            param_name,
+            sizeof(T),
+            &val));
+    }
+
+    template<cl_kernel_exec_info name>
+    cl_int setExecInfo(typename detail::param_traits<detail::cl_kernel_exec_info, name>::param_type& val)
+    {
+        return setExecInfo(name, val);
+    }
 #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+    /**
+     * Make a deep copy of the kernel object including its arguments.
+     * @return A new kernel object with internal state entirely separate from that
+     *         of the original but with any arguments set on the original intact.
+     */
+    Kernel clone()
+    {
+        cl_int error;
+        Kernel retValue(clCloneKernel(this->get(), &error));
+
+        detail::errHandler(error, __CLONE_KERNEL_ERR);
+        return retValue;
+    }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
 };
 
 /*! \class Program
@@ -5989,7 +6257,7 @@ class Program : public detail::Wrapper<cl_program>
     Program(
         const string& source,
         bool build = false,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -6008,19 +6276,19 @@ class Program : public detail::Wrapper<cl_program>
             error = ::clBuildProgram(
                 object_,
                 0,
-                NULL,
+                nullptr,
 #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
                 "-cl-std=CL2.0",
 #else
                 "",
 #endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
-                NULL,
-                NULL);
+                nullptr,
+                nullptr);
 
             detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
         }
 
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -6029,7 +6297,7 @@ class Program : public detail::Wrapper<cl_program>
         const Context& context,
         const string& source,
         bool build = false,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -6045,19 +6313,19 @@ class Program : public detail::Wrapper<cl_program>
             error = ::clBuildProgram(
                 object_,
                 0,
-                NULL,
+                nullptr,
 #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
                 "-cl-std=CL2.0",
 #else
                 "",
 #endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
-                NULL,
-                NULL);
+                nullptr,
+                nullptr);
             
             detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
         }
 
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -6068,7 +6336,7 @@ class Program : public detail::Wrapper<cl_program>
      */
     Program(
         const Sources& sources,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
         Context context = Context::getDefault(err);
@@ -6092,7 +6360,7 @@ class Program : public detail::Wrapper<cl_program>
             context(), (cl_uint)n, strings.data(), lengths.data(), &error);
 
         detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -6104,7 +6372,7 @@ class Program : public detail::Wrapper<cl_program>
     Program(
         const Context& context,
         const Sources& sources,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -6127,10 +6395,119 @@ class Program : public detail::Wrapper<cl_program>
             context(), (cl_uint)n, strings.data(), lengths.data(), &error);
 
         detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
+            *err = error;
+        }
+    }
+
+#if defined(CL_HPP_USE_IL_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210
+    /**
+     * Program constructor to allow construction of program from SPIR-V or another IL.
+     *
+     * Requires OpenCL 2.1 or newer or the cl_khr_il_program extension.
+     */
+    Program(
+        const vector<char>& IL,
+        bool build = false,
+        cl_int* err = nullptr)
+    {
+        cl_int error;
+
+        Context context = Context::getDefault(err);
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+        object_ = ::clCreateProgramWithIL(
+            context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+        typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR;
+        static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = nullptr;
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR);
+
+        object_ = pfn_clCreateProgramWithILKHR(
+                context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                nullptr,
+#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+                "-cl-std=CL2.0",
+#else
+                "",
+#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+                nullptr,
+                nullptr);
+
+            detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+        }
+
+        if (err != nullptr) {
+            *err = error;
+        }
+    }
+
+    /**
+     * Program constructor to allow construction of program from SPIR-V or another IL
+     * for a specific context.
+     *
+     * Requires OpenCL 2.1 or newer or the cl_khr_il_program extension.
+     */
+    Program(
+        const Context& context,
+        const vector<char>& IL,
+        bool build = false,
+        cl_int* err = nullptr)
+    {
+        cl_int error;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+        object_ = ::clCreateProgramWithIL(
+            context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+        typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR;
+        static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = nullptr;
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR);
+
+        object_ = pfn_clCreateProgramWithILKHR(
+            context(), static_cast<const void*>(IL.data()), IL.size(), &error);
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR);
+
+        if (error == CL_SUCCESS && build) {
+            error = ::clBuildProgram(
+                object_,
+                0,
+                nullptr,
+#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+                "-cl-std=CL2.0",
+#else
+                "",
+#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+                nullptr,
+                nullptr);
+
+            detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+        }
+
+        if (err != nullptr) {
             *err = error;
         }
     }
+#endif // defined(CL_HPP_USE_IL_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210
 
     /**
      * Construct a program object from a list of devices and a per-device list of binaries.
@@ -6141,12 +6518,12 @@ class Program : public detail::Wrapper<cl_program>
      *   match the size of binaries and filled with values to specify if each binary
      *   was successfully loaded.
      *   Set to CL_SUCCESS if the binary was successfully loaded.
-     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is nullptr.
      *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
-     * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+     * \param err if non-null, will be set to CL_SUCCESS on successful operation or one of the following errors:
      *   CL_INVALID_CONTEXT if context is not a valid context.
      *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; 
-     *     or if any entry in binaries is NULL or has length 0.
+     *     or if any entry in binaries is nullptr or has length 0.
      *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
      *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
      *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
@@ -6155,8 +6532,8 @@ class Program : public detail::Wrapper<cl_program>
         const Context& context,
         const vector<Device>& devices,
         const Binaries& binaries,
-        vector<cl_int>* binaryStatus = NULL,
-        cl_int* err = NULL)
+        vector<cl_int>* binaryStatus = nullptr,
+        cl_int* err = nullptr)
     {
         cl_int error;
         
@@ -6166,13 +6543,12 @@ class Program : public detail::Wrapper<cl_program>
         if(binaries.size() != numDevices) {
             error = CL_INVALID_VALUE;
             detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
             return;
         }
 
-
         vector<size_type> lengths(numDevices);
         vector<const unsigned char*> images(numDevices);
 #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
@@ -6186,7 +6562,7 @@ class Program : public detail::Wrapper<cl_program>
             lengths[i] = binaries[(int)i].second;
         }
 #endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
-        
+
         vector<cl_device_id> deviceIDs(numDevices);
         for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
             deviceIDs[deviceIndex] = (devices[deviceIndex])();
@@ -6195,16 +6571,16 @@ class Program : public detail::Wrapper<cl_program>
         if(binaryStatus) {
             binaryStatus->resize(numDevices);
         }
-        
+
         object_ = ::clCreateProgramWithBinary(
             context(), (cl_uint) devices.size(),
             deviceIDs.data(),
-            lengths.data(), images.data(), (binaryStatus != NULL && numDevices > 0)
+            lengths.data(), images.data(), (binaryStatus != nullptr && numDevices > 0)
                ? &binaryStatus->front()
-               : NULL, &error);
+               : nullptr, &error);
 
         detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -6219,7 +6595,7 @@ class Program : public detail::Wrapper<cl_program>
         const Context& context,
         const vector<Device>& devices,
         const string& kernelNames,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -6238,7 +6614,7 @@ class Program : public detail::Wrapper<cl_program>
             &error);
 
         detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -6247,7 +6623,7 @@ class Program : public detail::Wrapper<cl_program>
     Program() { }
     
 
-    /*! \brief Constructor from cl_mem - takes ownership.
+    /*! \brief Constructor from cl_program - takes ownership.
      *
      * \param retainObject will cause the constructor to retain its cl object.
      *                     Defaults to false to maintain compatibility with
@@ -6262,43 +6638,24 @@ class Program : public detail::Wrapper<cl_program>
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Program(const Program& program) : detail::Wrapper<cl_type>(program) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    Program& operator = (const Program &program)
-    {
-        detail::Wrapper<cl_type>::operator=(program);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Program(Program&& program) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(program)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    Program& operator = (Program &&program)
+    cl_int build(
+        const vector<Device>& devices,
+        const string& options,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
     {
-        detail::Wrapper<cl_type>::operator=(std::move(program));
-        return *this;
+        return build(devices, options.c_str(), notifyFptr, data);
     }
 
     cl_int build(
         const vector<Device>& devices,
-        const char* options = NULL,
-        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-        void* data = NULL) const
+        const char* options = nullptr,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
     {
         size_type numDevices = devices.size();
         vector<cl_device_id> deviceIDs(numDevices);
-        
+
         for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
             deviceIDs[deviceIndex] = (devices[deviceIndex])();
         }
@@ -6316,36 +6673,159 @@ class Program : public detail::Wrapper<cl_program>
     }
 
     cl_int build(
-        const char* options = NULL,
-        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-        void* data = NULL) const
+        const Device& device,
+        const string& options,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        return build(device, options.c_str(), notifyFptr, data);
+    }
+
+    cl_int build(
+        const Device& device,
+        const char* options = nullptr,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
     {
+        cl_device_id deviceID = device();
+
         cl_int buildError = ::clBuildProgram(
             object_,
-            0,
-            NULL,
+            1,
+            &deviceID,
             options,
             notifyFptr,
             data);
 
+        BuildLogType buildLog(0);
+        buildLog.push_back(std::make_pair(device, getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)));
+        return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog);
+    }
+
+    cl_int build(
+        const string& options,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        return build(options.c_str(), notifyFptr, data);
+    }
+
+    cl_int build(
+        const char* options = nullptr,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        cl_int buildError = ::clBuildProgram(
+            object_,
+            0,
+            nullptr,
+            options,
+            notifyFptr,
+            data);
 
         return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
     }
 
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
     cl_int compile(
-        const char* options = NULL,
-        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-        void* data = NULL) const
+        const string& options,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        return compile(options.c_str(), notifyFptr, data);
+    }
+
+    cl_int compile(
+        const char* options = nullptr,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
     {
         cl_int error = ::clCompileProgram(
             object_,
             0,
-            NULL,
+            nullptr,
             options,
             0,
-            NULL,
-            NULL,
+            nullptr,
+            nullptr,
+            notifyFptr,
+            data);
+        return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+    }
+
+    cl_int compile(
+        const string& options,
+        const vector<Program>& inputHeaders,
+        const vector<string>& headerIncludeNames,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        return compile(options.c_str(), inputHeaders, headerIncludeNames, notifyFptr, data);
+    }
+
+    cl_int compile(
+        const char* options,
+        const vector<Program>& inputHeaders,
+        const vector<string>& headerIncludeNames,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        static_assert(sizeof(cl::Program) == sizeof(cl_program),
+            "Size of cl::Program must be equal to size of cl_program");
+        vector<const char*> headerIncludeNamesCStr;
+        for(const string& name: headerIncludeNames) {
+            headerIncludeNamesCStr.push_back(name.c_str());
+        }
+        cl_int error = ::clCompileProgram(
+            object_,
+            0,
+            nullptr,
+            options,
+            static_cast<cl_uint>(inputHeaders.size()),
+            reinterpret_cast<const cl_program*>(inputHeaders.data()),
+            reinterpret_cast<const char**>(headerIncludeNamesCStr.data()),
+            notifyFptr,
+            data);
+        return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+    }
+
+    cl_int compile(
+        const string& options,
+        const vector<Device>& deviceList,
+        const vector<Program>& inputHeaders = vector<Program>(),
+        const vector<string>& headerIncludeNames = vector<string>(),
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        return compile(options.c_str(), deviceList, inputHeaders, headerIncludeNames, notifyFptr, data);
+    }
+
+    cl_int compile(
+        const char* options,
+        const vector<Device>& deviceList,
+        const vector<Program>& inputHeaders = vector<Program>(),
+        const vector<string>& headerIncludeNames = vector<string>(),
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+        void* data = nullptr) const
+    {
+        static_assert(sizeof(cl::Program) == sizeof(cl_program),
+            "Size of cl::Program must be equal to size of cl_program");
+        vector<const char*> headerIncludeNamesCStr;
+        for(const string& name: headerIncludeNames) {
+            headerIncludeNamesCStr.push_back(name.c_str());
+        }
+        vector<cl_device_id> deviceIDList;
+        for(const Device& device: deviceList) {
+            deviceIDList.push_back(device());
+        }
+        cl_int error = ::clCompileProgram(
+            object_,
+            static_cast<cl_uint>(deviceList.size()),
+            reinterpret_cast<const cl_device_id*>(deviceIDList.data()),
+            options,
+            static_cast<cl_uint>(inputHeaders.size()),
+            reinterpret_cast<const cl_program*>(inputHeaders.data()),
+            reinterpret_cast<const char**>(headerIncludeNamesCStr.data()),
             notifyFptr,
             data);
         return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
@@ -6360,14 +6840,14 @@ class Program : public detail::Wrapper<cl_program>
             __GET_PROGRAM_INFO_ERR);
     }
 
-    template <cl_int name> typename
+    template <cl_program_info name> typename
     detail::param_traits<detail::cl_program_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_program_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -6383,14 +6863,14 @@ class Program : public detail::Wrapper<cl_program>
                 __GET_PROGRAM_BUILD_INFO_ERR);
     }
 
-    template <cl_int name> typename
+    template <cl_program_build_info name> typename
     detail::param_traits<detail::cl_program_build_info, name>::param_type
-    getBuildInfo(const Device& device, cl_int* err = NULL) const
+    getBuildInfo(const Device& device, cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_program_build_info, name>::param_type param;
         cl_int result = getBuildInfo(device, name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -6401,9 +6881,9 @@ class Program : public detail::Wrapper<cl_program>
      * info type and for all devices in the program.
      * On an error reading the info for any device, an empty vector of info will be returned.
      */
-    template <cl_int name>
+    template <cl_program_build_info name>
     vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>>
-        getBuildInfo(cl_int *err = NULL) const
+        getBuildInfo(cl_int *err = nullptr) const
     {
         cl_int result = CL_SUCCESS;
 
@@ -6413,7 +6893,7 @@ class Program : public detail::Wrapper<cl_program>
 
         // If there was an initial error from getInfo return the error
         if (result != CL_SUCCESS) {
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = result;
             }
             return devInfo;
@@ -6431,7 +6911,7 @@ class Program : public detail::Wrapper<cl_program>
                 break;
             }
         }
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         if (result != CL_SUCCESS) {
@@ -6443,7 +6923,7 @@ class Program : public detail::Wrapper<cl_program>
     cl_int createKernels(vector<Kernel>* kernels)
     {
         cl_uint numKernels;
-        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, nullptr, &numKernels);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
         }
@@ -6451,7 +6931,7 @@ class Program : public detail::Wrapper<cl_program>
         vector<cl_kernel> value(numKernels);
         
         err = ::clCreateKernelsInProgram(
-            object_, numKernels, value.data(), NULL);
+            object_, numKernels, value.data(), nullptr);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
         }
@@ -6469,19 +6949,76 @@ class Program : public detail::Wrapper<cl_program>
         }
         return CL_SUCCESS;
     }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+#if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
+    /*! \brief Registers a callback function to be called when destructors for
+     *         program scope global variables are complete and before the
+     *         program is released.
+     *
+     *  Wraps clSetProgramReleaseCallback().
+     *
+     *  Each call to this function registers the specified user callback function
+     *  on a callback stack associated with program. The registered user callback
+     *  functions are called in the reverse order in which they were registered.
+     */
+    CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback(
+        void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data),
+        void * user_data = nullptr) CL_API_SUFFIX__VERSION_2_2_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clSetProgramReleaseCallback(
+                object_,
+                pfn_notify,
+                user_data),
+            __SET_PROGRAM_RELEASE_CALLBACK_ERR);
+    }
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
+
+    /*! \brief Sets a SPIR-V specialization constant.
+     *
+     *  Wraps clSetProgramSpecializationConstant().
+     */
+    template <typename T>
+    typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type
+        setSpecializationConstant(cl_uint index, const T &value)
+    {
+        return detail::errHandler(
+            ::clSetProgramSpecializationConstant(
+                object_,
+                index,
+                sizeof(value),
+                &value),
+            __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);
+    }
+
+    /*! \brief Sets a SPIR-V specialization constant.
+     *
+     *  Wraps clSetProgramSpecializationConstant().
+     */
+    cl_int setSpecializationConstant(cl_uint index, size_type size, const void* value)
+    {
+        return detail::errHandler(
+            ::clSetProgramSpecializationConstant(
+                object_,
+                index,
+                size,
+                value),
+            __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);
+    }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220
 };
 
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
 inline Program linkProgram(
-    Program input1,
-    Program input2,
-    const char* options = NULL,
-    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-    void* data = NULL,
-    cl_int* err = NULL) 
+    const Program& input1,
+    const Program& input2,
+    const char* options = nullptr,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+    void* data = nullptr,
+    cl_int* err = nullptr)
 {
     cl_int error_local = CL_SUCCESS;
-
     cl_program programs[2] = { input1(), input2() };
 
     Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>(&error_local);
@@ -6492,7 +7029,7 @@ inline Program linkProgram(
     cl_program prog = ::clLinkProgram(
         ctx(),
         0,
-        NULL,
+        nullptr,
         options,
         2,
         programs,
@@ -6501,7 +7038,7 @@ inline Program linkProgram(
         &error_local);
 
     detail::errHandler(error_local,__COMPILE_PROGRAM_ERR);
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = error_local;
     }
 
@@ -6509,44 +7046,63 @@ inline Program linkProgram(
 }
 
 inline Program linkProgram(
-    vector<Program> inputPrograms,
-    const char* options = NULL,
-    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-    void* data = NULL,
-    cl_int* err = NULL) 
+    const Program& input1,
+    const Program& input2,
+    const string& options,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+    void* data = nullptr,
+    cl_int* err = nullptr)
+{
+    return linkProgram(input1, input2, options.c_str(), notifyFptr, data, err);
+}
+
+inline Program linkProgram(
+    const vector<Program>& inputPrograms,
+    const char* options = nullptr,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+    void* data = nullptr,
+    cl_int* err = nullptr)
 {
     cl_int error_local = CL_SUCCESS;
+    Context ctx;
 
-    vector<cl_program> programs(inputPrograms.size());
+    static_assert(sizeof(cl::Program) == sizeof(cl_program),
+        "Size of cl::Program must be equal to size of cl_program");
 
-    for (unsigned int i = 0; i < inputPrograms.size(); i++) {
-        programs[i] = inputPrograms[i]();
-    }
-    
-    Context ctx;
     if(inputPrograms.size() > 0) {
         ctx = inputPrograms[0].getInfo<CL_PROGRAM_CONTEXT>(&error_local);
         if(error_local!=CL_SUCCESS) {
             detail::errHandler(error_local, __LINK_PROGRAM_ERR);
         }
     }
+
     cl_program prog = ::clLinkProgram(
         ctx(),
         0,
-        NULL,
+        nullptr,
         options,
-        (cl_uint)inputPrograms.size(),
-        programs.data(),
+        static_cast<cl_uint>(inputPrograms.size()),
+        reinterpret_cast<const cl_program *>(inputPrograms.data()),
         notifyFptr,
         data,
         &error_local);
 
     detail::errHandler(error_local,__COMPILE_PROGRAM_ERR);
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = error_local;
     }
 
-    return Program(prog, false);
+    return Program(prog);
+}
+
+inline Program linkProgram(
+    const vector<Program>& inputPrograms,
+    const string& options,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,
+    void* data = nullptr,
+    cl_int* err = nullptr)
+{
+    return linkProgram(inputPrograms, options.c_str(), notifyFptr, data, err);
 }
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
@@ -6584,12 +7140,40 @@ inline vector<vector<unsigned char>> cl::Program::getInfo<CL_PROGRAM_BINARIES>(c
     vector<vector<unsigned char>> binariesVectors;
 
     cl_int result = getInfo(CL_PROGRAM_BINARIES, &binariesVectors);
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = result;
     }
     return binariesVectors;
 }
 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 220
+// Specialization of Program::setSpecializationConstant for bool: the value is passed as a cl_uchar (CL_UCHAR_MAX for true, 0 for false)
+template <>
+inline cl_int cl::Program::setSpecializationConstant(cl_uint index, const bool &value)
+{
+    cl_uchar ucValue = value ? CL_UCHAR_MAX : 0;
+    return detail::errHandler(
+        ::clSetProgramSpecializationConstant(
+            object_,
+            index,
+            sizeof(ucValue),
+            &ucValue),
+        __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);
+}
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220
+
+inline Kernel::Kernel(const Program& program, const string& name, cl_int* err)
+{
+    cl_int error;
+
+    object_ = ::clCreateKernel(program(), name.c_str(), &error);
+    detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+    if (err != nullptr) {
+        *err = error;
+    }
+}
+
 inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
 {
     cl_int error;
@@ -6597,24 +7181,47 @@ inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
     object_ = ::clCreateKernel(program(), name, &error);
     detail::errHandler(error, __CREATE_KERNEL_ERR);
 
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = error;
     }
-
 }
 
-enum class QueueProperties : cl_command_queue_properties
+#ifdef cl_khr_external_memory
+enum class ExternalMemoryType : cl_external_memory_handle_type_khr
 {
     None = 0,
-    Profiling = CL_QUEUE_PROFILING_ENABLE,
-    OutOfOrder = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-};
 
-inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs)
+    OpaqueFd = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR,
+    OpaqueWin32 = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR,
+    OpaqueWin32Kmt = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR,
+
+    D3D11Texture = CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KHR,
+    D3D11TextureKmt = CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KMT_KHR,
+
+    D3D12Heap = CL_EXTERNAL_MEMORY_HANDLE_D3D12_HEAP_KHR,
+    D3D12Resource = CL_EXTERNAL_MEMORY_HANDLE_D3D12_RESOURCE_KHR,
+
+    DmaBuf = CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR,
+};
+#endif
+
+enum class QueueProperties : cl_command_queue_properties
+{
+    None = 0,
+    Profiling = CL_QUEUE_PROFILING_ENABLE,
+    OutOfOrder = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+};
+
+inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs)
 {
     return static_cast<QueueProperties>(static_cast<cl_command_queue_properties>(lhs) | static_cast<cl_command_queue_properties>(rhs));
 }
 
+inline QueueProperties operator&(QueueProperties lhs, QueueProperties rhs)
+{
+    return static_cast<QueueProperties>(static_cast<cl_command_queue_properties>(lhs) & static_cast<cl_command_queue_properties>(rhs));
+}
+
 /*! \class CommandQueue
  * \brief CommandQueue interface for cl_command_queue.
  */
@@ -6666,6 +7273,24 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         default_ = c;
     }
 
+#ifdef cl_khr_external_memory
+    static std::once_flag ext_memory_initialized_;
+
+    static void initMemoryExtension(const cl::Device& device) 
+    {
+        auto platform = device.getInfo<CL_DEVICE_PLATFORM>()();
+
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireExternalMemObjectsKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseExternalMemObjectsKHR);
+
+        if ((pfn_clEnqueueAcquireExternalMemObjectsKHR == nullptr)
+            && (pfn_clEnqueueReleaseExternalMemObjectsKHR == nullptr))
+        {
+            detail::errHandler(CL_INVALID_VALUE, __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR);
+        }
+    }
+#endif // cl_khr_external_memory
+
 public:
 #ifdef CL_HPP_UNIT_TEST_ENABLE
     /*! \brief Reset the default.
@@ -6686,7 +7311,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
      */
    CommandQueue(
         cl_command_queue_properties properties,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -6694,7 +7319,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         detail::errHandler(error, __CREATE_CONTEXT_ERR);
 
         if (error != CL_SUCCESS) {
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -6727,7 +7352,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 }
 
                 detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-                if (err != NULL) {
+                if (err != nullptr) {
                     *err = error;
                 }
             }
@@ -6738,7 +7363,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                     context(), device(), properties, &error);
 
                 detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-                if (err != NULL) {
+                if (err != nullptr) {
                     *err = error;
                 }
             }
@@ -6752,7 +7377,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     */
    CommandQueue(
        QueueProperties properties,
-       cl_int* err = NULL)
+       cl_int* err = nullptr)
    {
        cl_int error;
 
@@ -6760,7 +7385,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
        detail::errHandler(error, __CREATE_CONTEXT_ERR);
 
        if (error != CL_SUCCESS) {
-           if (err != NULL) {
+           if (err != nullptr) {
                *err = error;
            }
        }
@@ -6789,7 +7414,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                    context(), device(), queue_properties, &error);
 
                detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-               if (err != NULL) {
+               if (err != nullptr) {
                    *err = error;
                }
            }
@@ -6800,7 +7425,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                    context(), device(), static_cast<cl_command_queue_properties>(properties), &error);
 
                detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-               if (err != NULL) {
+               if (err != nullptr) {
                    *err = error;
                }
            }
@@ -6816,7 +7441,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     explicit CommandQueue(
         const Context& context,
         cl_command_queue_properties properties = 0,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
         bool useWithProperties;
@@ -6827,7 +7452,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
 
         if (error != CL_SUCCESS)
         {
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
             return;
@@ -6858,7 +7483,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
             }
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -6869,7 +7494,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), devices[0](), properties, &error);
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -6883,7 +7508,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     explicit CommandQueue(
         const Context& context,
         QueueProperties properties,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
         bool useWithProperties;
@@ -6894,7 +7519,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
 
         if (error != CL_SUCCESS)
         {
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
             return;
@@ -6920,7 +7545,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), devices[0](), queue_properties, &error);
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -6931,7 +7556,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), devices[0](), static_cast<cl_command_queue_properties>(properties), &error);
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -6946,7 +7571,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         const Context& context,
         const Device& device,
         cl_command_queue_properties properties = 0,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
         bool useWithProperties;
@@ -6971,7 +7596,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), device(), queue_properties, &error);
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -6982,7 +7607,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), device(), properties, &error);
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -6997,7 +7622,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         const Context& context,
         const Device& device,
         QueueProperties properties,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
         bool useWithProperties;
@@ -7022,7 +7647,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), device(), queue_properties, &error);
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
@@ -7033,14 +7658,14 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), device(), static_cast<cl_command_queue_properties>(properties), &error);
 
             detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-            if (err != NULL) {
+            if (err != nullptr) {
                 *err = error;
             }
         }
 #endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200
     }
 
-    static CommandQueue getDefault(cl_int * err = NULL) 
+    static CommandQueue getDefault(cl_int * err = nullptr) 
     {
         std::call_once(default_initialized_, makeDefault);
 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
@@ -7048,7 +7673,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
 #else // CL_HPP_TARGET_OPENCL_VERSION >= 200
         detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_ERR);
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = default_error_;
         }
         return default_;
@@ -7071,7 +7696,7 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     CommandQueue() { }
 
 
-    /*! \brief Constructor from cl_mem - takes ownership.
+    /*! \brief Constructor from cl_command_queue - takes ownership.
      *
      * \param retainObject will cause the constructor to retain its cl object.
      *                     Defaults to false to maintain compatibility with
@@ -7086,34 +7711,6 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    CommandQueue(const CommandQueue& queue) : detail::Wrapper<cl_type>(queue) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    CommandQueue& operator = (const CommandQueue &queue)
-    {
-        detail::Wrapper<cl_type>::operator=(queue);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    CommandQueue(CommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(queue)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    CommandQueue& operator = (CommandQueue &&queue)
-    {
-        detail::Wrapper<cl_type>::operator=(std::move(queue));
-        return *this;
-    }
-
     template <typename T>
     cl_int getInfo(cl_command_queue_info name, T* param) const
     {
@@ -7123,14 +7720,14 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 __GET_COMMAND_QUEUE_INFO_ERR);
     }
 
-    template <cl_int name> typename
+    template <cl_command_queue_info name> typename
     detail::param_traits<detail::cl_command_queue_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+    getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_command_queue_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
@@ -7142,20 +7739,20 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type offset,
         size_type size,
         void* ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueReadBuffer(
                 object_, buffer(), blocking, offset, size,
                 ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_READ_BUFFER_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7167,20 +7764,20 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type offset,
         size_type size,
         const void* ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueWriteBuffer(
                 object_, buffer(), blocking, offset, size,
                 ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
                 __ENQUEUE_WRITE_BUFFER_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7192,19 +7789,19 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type src_offset,
         size_type dst_offset,
         size_type size,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueCopyBuffer(
                 object_, src(), dst(), src_offset, dst_offset, size,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQEUE_COPY_BUFFER_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7221,8 +7818,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type host_row_pitch,
         size_type host_slice_pitch,
         void *ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7238,17 +7835,46 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 host_row_pitch,
                 host_slice_pitch,
                 ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
                 __ENQUEUE_READ_BUFFER_RECT_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
+    cl_int enqueueReadBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const array<size_type, 2>& buffer_offset,
+        const array<size_type, 2>& host_offset,
+        const array<size_type, 2>& region,
+        size_type buffer_row_pitch,
+        size_type buffer_slice_pitch,
+        size_type host_row_pitch,
+        size_type host_slice_pitch,
+        void* ptr,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { // 2D convenience overload: widens each 2-element array to 3 elements and re-dispatches to enqueueReadBufferRect.
+        return enqueueReadBufferRect(
+            buffer,
+            blocking,
+            { buffer_offset[0], buffer_offset[1], 0 }, // third offset component fixed at 0
+            { host_offset[0], host_offset[1], 0 }, // third offset component fixed at 0
+            { region[0], region[1], 1 }, // third region component fixed at 1
+            buffer_row_pitch,
+            buffer_slice_pitch,
+            host_row_pitch,
+            host_slice_pitch,
+            ptr,
+            events,
+            event); // result of the forwarded call is returned unchanged
+    }
+
     cl_int enqueueWriteBufferRect(
         const Buffer& buffer,
         cl_bool blocking,
@@ -7260,8 +7886,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type host_row_pitch,
         size_type host_slice_pitch,
         const void *ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7277,17 +7903,46 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 host_row_pitch,
                 host_slice_pitch,
                 ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
                 __ENQUEUE_WRITE_BUFFER_RECT_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
+    cl_int enqueueWriteBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const array<size_type, 2>& buffer_offset,
+        const array<size_type, 2>& host_offset,
+        const array<size_type, 2>& region,
+        size_type buffer_row_pitch,
+        size_type buffer_slice_pitch,
+        size_type host_row_pitch,
+        size_type host_slice_pitch,
+        const void* ptr,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { // 2D convenience overload: widens each 2-element array to 3 elements and re-dispatches to enqueueWriteBufferRect.
+        return enqueueWriteBufferRect(
+            buffer, 
+            blocking,
+            { buffer_offset[0], buffer_offset[1], 0 }, // third offset component fixed at 0
+            { host_offset[0], host_offset[1], 0 }, // third offset component fixed at 0
+            { region[0], region[1], 1 }, // third region component fixed at 1
+            buffer_row_pitch,
+            buffer_slice_pitch,
+            host_row_pitch,
+            host_slice_pitch,
+            ptr,
+            events,
+            event); // result of the forwarded call is returned unchanged
+    }
+
     cl_int enqueueCopyBufferRect(
         const Buffer& src,
         const Buffer& dst,
@@ -7298,8 +7953,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type src_slice_pitch,
         size_type dst_row_pitch,
         size_type dst_slice_pitch,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7314,16 +7969,44 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 src_slice_pitch,
                 dst_row_pitch,
                 dst_slice_pitch,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQEUE_COPY_BUFFER_RECT_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
+
+    cl_int enqueueCopyBufferRect(
+        const Buffer& src,
+        const Buffer& dst,
+        const array<size_type, 2>& src_origin,
+        const array<size_type, 2>& dst_origin,
+        const array<size_type, 2>& region,
+        size_type src_row_pitch,
+        size_type src_slice_pitch,
+        size_type dst_row_pitch,
+        size_type dst_slice_pitch,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { // 2D convenience overload: widens each 2-element array to 3 elements and re-dispatches to enqueueCopyBufferRect.
+        return enqueueCopyBufferRect(
+            src,
+            dst,
+            { src_origin[0], src_origin[1], 0 }, // third origin component fixed at 0
+            { dst_origin[0], dst_origin[1], 0 }, // third origin component fixed at 0
+            { region[0], region[1], 1 }, // third region component fixed at 1
+            src_row_pitch,
+            src_slice_pitch,
+            dst_row_pitch,
+            dst_slice_pitch,
+            events,
+            event); // result of the forwarded call is returned unchanged
+    }
+
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
     /**
@@ -7343,8 +8026,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         PatternType pattern,
         size_type offset,
         size_type size,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7355,12 +8038,12 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 sizeof(PatternType), 
                 offset, 
                 size,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
                 __ENQUEUE_FILL_BUFFER_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7375,8 +8058,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type row_pitch,
         size_type slice_pitch,
         void* ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7389,17 +8072,40 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 row_pitch, 
                 slice_pitch, 
                 ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_READ_IMAGE_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
+    cl_int enqueueReadImage(
+        const Image& image,
+        cl_bool blocking,
+        const array<size_type, 2>& origin,
+        const array<size_type, 2>& region,
+        size_type row_pitch,
+        size_type slice_pitch,
+        void* ptr,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { // 2D convenience overload: widens each 2-element array to 3 elements and re-dispatches to enqueueReadImage.
+        return enqueueReadImage(
+            image,
+            blocking,
+            { origin[0], origin[1], 0 }, // third origin component fixed at 0
+            { region[0], region[1], 1 }, // third region component fixed at 1
+            row_pitch,
+            slice_pitch,
+            ptr,
+            events,
+            event); // result of the forwarded call is returned unchanged
+    }
+
     cl_int enqueueWriteImage(
         const Image& image,
         cl_bool blocking,
@@ -7408,8 +8114,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         size_type row_pitch,
         size_type slice_pitch,
         const void* ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7422,25 +8128,48 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 row_pitch, 
                 slice_pitch, 
                 ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_WRITE_IMAGE_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
+    cl_int enqueueWriteImage(
+        const Image& image,
+        cl_bool blocking,
+        const array<size_type, 2>& origin,
+        const array<size_type, 2>& region,
+        size_type row_pitch,
+        size_type slice_pitch,
+        const void* ptr,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { // 2D convenience overload: widens each 2-element array to 3 elements and re-dispatches to enqueueWriteImage.
+        return enqueueWriteImage(
+            image,
+            blocking,
+            { origin[0], origin[1], 0 }, // third origin component fixed at 0
+            { region[0], region[1], 1 }, // third region component fixed at 1
+            row_pitch,
+            slice_pitch,
+            ptr,
+            events,
+            event); // result of the forwarded call is returned unchanged
+    }
+
     cl_int enqueueCopyImage(
         const Image& src,
         const Image& dst,
         const array<size_type, 3>& src_origin,
         const array<size_type, 3>& dst_origin,
         const array<size_type, 3>& region,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7451,118 +8180,102 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 src_origin.data(),
                 dst_origin.data(), 
                 region.data(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_COPY_IMAGE_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
-#if CL_HPP_TARGET_OPENCL_VERSION >= 120
-    /**
-     * Enqueue a command to fill an image object with a specified color.
-     * \param fillColor is the color to use to fill the image.
-     *     This is a four component RGBA floating-point color value if
-     *     the image channel data type is not an unnormalized signed or
-     *     unsigned data type.
-     */
-    cl_int enqueueFillImage(
-        const Image& image,
-        cl_float4 fillColor,
-        const array<size_type, 3>& origin,
-        const array<size_type, 3>& region,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueFillImage(
-                object_, 
-                image(),
-                static_cast<void*>(&fillColor), 
-                origin.data(),
-                region.data(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_FILL_IMAGE_ERR);
-
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-
-        return err;
+    cl_int enqueueCopyImage(
+        const Image& src,
+        const Image& dst,
+        const array<size_type, 2>& src_origin,
+        const array<size_type, 2>& dst_origin,
+        const array<size_type, 2>& region,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { // 2D convenience overload: widens each 2-element array to 3 elements and re-dispatches to enqueueCopyImage.
+        return enqueueCopyImage(
+            src,
+            dst,
+            { src_origin[0], src_origin[1], 0 }, // third origin component fixed at 0
+            { dst_origin[0], dst_origin[1], 0 }, // third origin component fixed at 0
+            { region[0], region[1], 1 }, // third region component fixed at 1
+            events,
+            event); // result of the forwarded call is returned unchanged
     }
 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
     /**
      * Enqueue a command to fill an image object with a specified color.
      * \param fillColor is the color to use to fill the image.
-     *     This is a four component RGBA signed integer color value if
-     *     the image channel data type is an unnormalized signed integer
-     *     type.
+     *     Pass a cl_float4 unless the image channel data type is an
+     *     unnormalized integer type; then pass a cl_int4 (signed) or
+     *     cl_uint4 (unsigned) to match it.
      */
-    cl_int enqueueFillImage(
-        const Image& image,
-        cl_int4 fillColor,
-        const array<size_type, 3>& origin,
-        const array<size_type, 3>& region,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+    template <typename T> // participates only for T = cl_float4, cl_int4 or cl_uint4 (see enable_if below)
+    typename std::enable_if<std::is_same<T, cl_float4>::value ||
+                            std::is_same<T, cl_int4  >::value ||
+                            std::is_same<T, cl_uint4 >::value,
+                            cl_int>::type 
+     enqueueFillImage(
+         const Image& image, 
+         T fillColor,
+         const array<size_type, 3>& origin,
+         const array<size_type, 3>& region,
+         const vector<Event>* events = nullptr,
+         Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueFillImage(
-                object_, 
+                object_,
                 image(),
-                static_cast<void*>(&fillColor), 
+                static_cast<void*>(&fillColor), // address of the by-value colour argument
                 origin.data(),
                 region.data(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_FILL_IMAGE_ERR);
+                (events != nullptr) ? (cl_uint)events->size() : 0, // wait-list length, 0 when no list is given
+                (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr), // request an output event only if the caller wants one
+            __ENQUEUE_FILL_IMAGE_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
+        if (event != nullptr && err == CL_SUCCESS) *event = tmp; // hand the event back only on success
 
         return err;
     }
 
-    /**
+   /**
      * Enqueue a command to fill an image object with a specified color.
      * \param fillColor is the color to use to fill the image.
-     *     This is a four component RGBA unsigned integer color value if
-     *     the image channel data type is an unnormalized unsigned integer
-     *     type.
+     *     Pass a cl_float4 unless the image channel data type is an
+     *     unnormalized integer type; then pass a cl_int4 (signed) or
+     *     cl_uint4 (unsigned) to match it.
      */
-    cl_int enqueueFillImage(
+    template <typename T> // participates only for T = cl_float4, cl_int4 or cl_uint4 (see enable_if below)
+    typename std::enable_if<std::is_same<T, cl_float4>::value ||
+                            std::is_same<T, cl_int4  >::value ||
+                            std::is_same<T, cl_uint4 >::value, cl_int>::type
+    enqueueFillImage(
         const Image& image,
-        cl_uint4 fillColor,
-        const array<size_type, 3>& origin,
-        const array<size_type, 3>& region,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueFillImage(
-                object_, 
-                image(),
-                static_cast<void*>(&fillColor), 
-                origin.data(),
-                region.data(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_FILL_IMAGE_ERR);
-
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-
-        return err;
+        T fillColor,
+        const array<size_type, 2>& origin,
+        const array<size_type, 2>& region,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { // 2D convenience overload: widens each 2-element array to 3 elements and re-dispatches to enqueueFillImage.
+        return enqueueFillImage(
+            image,
+            fillColor,
+            { origin[0], origin[1], 0 }, // third origin component fixed at 0
+            { region[0], region[1], 1 }, // third region component fixed at 1
+            events,
+            event
+            ); // result of the forwarded call is returned unchanged
     }
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
@@ -7572,8 +8285,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         const array<size_type, 3>& src_origin,
         const array<size_type, 3>& region,
         size_type dst_offset,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7584,25 +8297,44 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 src_origin.data(),
                 region.data(), 
                 dst_offset,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
+    cl_int enqueueCopyImageToBuffer( // 2D convenience overload; forwards to the 3-component overload
+        const Image& src,
+        const Buffer& dst,
+        const array<size_type, 2>& src_origin, // (x, y) start of the source region
+        const array<size_type, 2>& region,     // (width, height) of the source region
+        size_type dst_offset,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    { 
+        return enqueueCopyImageToBuffer(
+            src,
+            dst,
+            { src_origin[0], src_origin[1], 0 }, // third origin component fixed at 0
+            { region[0], region[1], 1 },         // third region component fixed at 1
+            dst_offset,
+            events,
+            event);
+    }
+
     cl_int enqueueCopyBufferToImage(
         const Buffer& src,
         const Image& dst,
         size_type src_offset,
         const array<size_type, 3>& dst_origin,
         const array<size_type, 3>& region,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
@@ -7613,80 +8345,273 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
                 src_offset,
                 dst_origin.data(), 
                 region.data(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
+    cl_int enqueueCopyBufferToImage( // 2D convenience overload; forwards to the 3-component overload
+        const Buffer& src,
+        const Image& dst,
+        size_type src_offset,
+        const array<size_type, 2>& dst_origin, // (x, y) start of the destination region
+        const array<size_type, 2>& region,     // (width, height) of the destination region
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    {
+        return enqueueCopyBufferToImage(
+            src,
+            dst, 
+            src_offset,
+            { dst_origin[0], dst_origin[1], 0 }, // third origin component fixed at 0
+            { region[0], region[1], 1 },         // third region component fixed at 1
+            events,
+            event);
+    }
+
     void* enqueueMapBuffer(
         const Buffer& buffer,
         cl_bool blocking,
         cl_map_flags flags,
         size_type offset,
         size_type size,
-        const vector<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL) const
+        const vector<Event>* events = nullptr, // optional wait list; nullptr is passed on as a count of 0
+        Event* event = nullptr,                // optional; assigned the command's event only on CL_SUCCESS
+        cl_int* err = nullptr) const           // optional; receives the status written by clEnqueueMapBuffer
     {
         cl_event tmp;
         cl_int error;
         void * result = ::clEnqueueMapBuffer(
             object_, buffer(), blocking, flags, offset, size,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (event != NULL) ? &tmp : NULL,
+            (events != nullptr) ? (cl_uint) events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, // never call front() on an empty list
+            (event != nullptr) ? &tmp : nullptr, // request an event only when the caller wants one
             &error);
 
         detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) { // report the raw status to the caller as well
             *err = error;
         }
-        if (event != NULL && error == CL_SUCCESS)
+        if (event != nullptr && error == CL_SUCCESS) // tmp is only meaningful after a successful enqueue
             *event = tmp;
 
         return result;
     }
 
     void* enqueueMapImage(
-        const Image& buffer,
+        const Image& image, // image object to map
         cl_bool blocking,
         cl_map_flags flags,
         const array<size_type, 3>& origin,
         const array<size_type, 3>& region,
         size_type * row_pitch,
         size_type * slice_pitch,
-        const vector<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL) const
+        const vector<Event>* events = nullptr, // optional wait list; nullptr is passed on as a count of 0
+        Event* event = nullptr,                // optional; assigned the command's event only on CL_SUCCESS
+        cl_int* err = nullptr) const           // optional; receives the status written by clEnqueueMapImage
     {
         cl_event tmp;
         cl_int error;
         void * result = ::clEnqueueMapImage(
-            object_, buffer(), blocking, flags,
+            object_, image(), blocking, flags,
             origin.data(), 
             region.data(),
             row_pitch, slice_pitch,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (event != NULL) ? &tmp : NULL,
+            (events != nullptr) ? (cl_uint) events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, // never call front() on an empty list
+            (event != nullptr) ? &tmp : nullptr, // request an event only when the caller wants one
             &error);
 
         detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
-        if (err != NULL) {
+        if (err != nullptr) { // report the raw status to the caller as well
               *err = error;
         }
-        if (event != NULL && error == CL_SUCCESS)
+        if (event != nullptr && error == CL_SUCCESS) // tmp is only meaningful after a successful enqueue
             *event = tmp;
         return result;
     }
 
+    void* enqueueMapImage( // 2D convenience overload; forwards to the 3-component overload
+         const Image& image,
+         cl_bool blocking,
+         cl_map_flags flags,
+         const array<size_type, 2>& origin, // (x, y) start of the mapped region
+         const array<size_type, 2>& region, // (width, height) of the mapped region
+         size_type* row_pitch,
+         size_type* slice_pitch,
+         const vector<Event>* events = nullptr,
+         Event* event = nullptr,
+         cl_int* err = nullptr) const
+    {
+        return enqueueMapImage(image, blocking, flags,
+                               { origin[0], origin[1], 0 }, // third origin component fixed at 0
+                               { region[0], region[1], 1 }, row_pitch, // third region component fixed at 1
+                               slice_pitch, events, event, err);
+    }
+
 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+    /**
+    * Enqueues a command that copies size bytes from src_ptr to dst_ptr via clEnqueueSVMMemcpy.
+    * This raw-pointer variant is intended for transfers between the host and a coarse-grained SVM buffer.
+    */
+    template<typename T>
+    cl_int enqueueMemcpySVM(
+            T *dst_ptr,
+            const T *src_ptr,
+            cl_bool blocking,
+            size_type size, // forwarded unchanged as the size argument of clEnqueueSVMMemcpy
+            const vector<Event> *events = nullptr,
+            Event *event = nullptr) const {
+        cl_event tmp;
+        cl_int err = detail::errHandler(::clEnqueueSVMMemcpy(
+                object_, blocking, static_cast<void *>(dst_ptr), static_cast<const void *>(src_ptr), size,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr, // never call front() on an empty list
+                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_COPY_SVM_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+    * Enqueues a command that will copy data from one coarse-grained SVM buffer to another.
+    * This variant takes two cl::pointer instances representing the destination and source buffers.
+    */
+    template<typename T, class D>
+    cl_int enqueueMemcpySVM(
+            cl::pointer<T, D> &dst_ptr,
+            const cl::pointer<T, D> &src_ptr,
+            cl_bool blocking,
+            size_type size, // forwarded unchanged as the size argument of clEnqueueSVMMemcpy
+            const vector<Event> *events = nullptr,
+            Event *event = nullptr) const {
+        cl_event tmp;
+        cl_int err = detail::errHandler(::clEnqueueSVMMemcpy(
+                object_, blocking, static_cast<void *>(dst_ptr.get()), static_cast<const void *>(src_ptr.get()), // unwrap to raw addresses
+                size,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr, // never call front() on an empty list
+                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_COPY_SVM_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+    * Enqueues a command that copies the contents of one coarse-grained SVM container into another.
+    * This variant takes cl::vector instances; a size mismatch is reported as CL_INVALID_VALUE via errHandler.
+    */
+    template<typename T, class Alloc>
+    cl_int enqueueMemcpySVM(
+            cl::vector<T, Alloc> &dst_container,
+            const cl::vector<T, Alloc> &src_container,
+            cl_bool blocking,
+            const vector<Event> *events = nullptr,
+            Event *event = nullptr) const {
+        cl_event tmp;
+        if(src_container.size() != dst_container.size()){ // nothing is enqueued when the element counts differ
+            return detail::errHandler(CL_INVALID_VALUE,__ENQUEUE_COPY_SVM_ERR);
+        }
+        cl_int err = detail::errHandler(::clEnqueueSVMMemcpy(
+                object_, blocking, static_cast<void *>(dst_container.data()),
+                static_cast<const void *>(src_container.data()),
+                dst_container.size() * sizeof(T), // element count converted to a byte count
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_COPY_SVM_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+    * Enqueues a command to fill an SVM buffer with a pattern.
+    * This variant takes a raw SVM pointer; sizeof(PatternType) is passed as the pattern size.
+    */
+    template<typename T, typename PatternType>
+    cl_int enqueueMemFillSVM(
+            T *ptr,
+            PatternType pattern, // taken by value; its address is passed to clEnqueueSVMMemFill
+            size_type size,      // forwarded unchanged as the size argument of clEnqueueSVMMemFill
+            const vector<Event> *events = nullptr,
+            Event *event = nullptr) const {
+        cl_event tmp;
+        cl_int err = detail::errHandler(::clEnqueueSVMMemFill(
+                object_, static_cast<void *>(ptr), static_cast<void *>(&pattern),
+                sizeof(PatternType), size,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr, // never call front() on an empty list
+                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_FILL_SVM_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+    * Enqueues a command that fills a region of a coarse-grained SVM buffer with a specified pattern.
+    * This variant takes a cl::pointer instance; sizeof(PatternType) is passed as the pattern size.
+    */
+    template<typename T, class D, typename PatternType>
+    cl_int enqueueMemFillSVM(
+            cl::pointer<T, D> &ptr,
+            PatternType pattern, // taken by value; its address is passed to clEnqueueSVMMemFill
+            size_type size,      // forwarded unchanged as the size argument of clEnqueueSVMMemFill
+            const vector<Event> *events = nullptr,
+            Event *event = nullptr) const {
+        cl_event tmp;
+        cl_int err = detail::errHandler(::clEnqueueSVMMemFill(
+                object_, static_cast<void *>(ptr.get()), static_cast<void *>(&pattern), // unwrap to the raw address
+                sizeof(PatternType), size,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr, // never call front() on an empty list
+                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_FILL_SVM_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+    * Enqueues a command that fills an entire coarse-grained SVM container with a specified pattern.
+    * This variant takes a cl::vector instance; the fill covers container.size() * sizeof(T) bytes.
+    */
+    template<typename T, class Alloc, typename PatternType>
+    cl_int enqueueMemFillSVM(
+            cl::vector<T, Alloc> &container,
+            PatternType pattern, // taken by value; its address is passed to clEnqueueSVMMemFill
+            const vector<Event> *events = nullptr,
+            Event* event = nullptr) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(::clEnqueueSVMMemFill(
+                object_, static_cast<void *>(container.data()), static_cast<void *>(&pattern),
+                sizeof(PatternType), container.size() * sizeof(T), // pattern size, then total byte count
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_FILL_SVM_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
+            *event = tmp;
+
+        return err;
+    }
+
     /**
      * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
      * This variant takes a raw SVM pointer.
@@ -7697,18 +8622,18 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         cl_bool blocking,
         cl_map_flags flags,
         size_type size,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(::clEnqueueSVMMap(
             object_, blocking, flags, static_cast<void*>(ptr), size,
-            (events != NULL) ? (cl_uint)events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-            (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_MAP_BUFFER_ERR);
+            (events != nullptr) ? (cl_uint)events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+            (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_MAP_SVM_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7725,18 +8650,18 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         cl_bool blocking,
         cl_map_flags flags,
         size_type size,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(::clEnqueueSVMMap(
             object_, blocking, flags, static_cast<void*>(ptr.get()), size,
-            (events != NULL) ? (cl_uint)events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-            (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_MAP_BUFFER_ERR);
+            (events != nullptr) ? (cl_uint)events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+            (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_MAP_SVM_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7751,18 +8676,18 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         cl::vector<T, Alloc> &container,
         cl_bool blocking,
         cl_map_flags flags,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(::clEnqueueSVMMap(
-            object_, blocking, flags, static_cast<void*>(container.data()), container.size(),
-            (events != NULL) ? (cl_uint)events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-            (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_MAP_BUFFER_ERR);
+            object_, blocking, flags, static_cast<void*>(container.data()), container.size()*sizeof(T),
+            (events != nullptr) ? (cl_uint)events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+            (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_MAP_SVM_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7772,19 +8697,19 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     cl_int enqueueUnmapMemObject(
         const Memory& memory,
         void* mapped_ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueUnmapMemObject(
                 object_, memory(), mapped_ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7799,19 +8724,19 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     template<typename T>
     cl_int enqueueUnmapSVM(
         T* ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueSVMUnmap(
             object_, static_cast<void*>(ptr),
-            (events != NULL) ? (cl_uint)events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-            (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+            (events != nullptr) ? (cl_uint)events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+            (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_UNMAP_SVM_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7824,19 +8749,19 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     template<typename T, class D>
     cl_int enqueueUnmapSVM(
         cl::pointer<T, D> &ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueSVMUnmap(
             object_, static_cast<void*>(ptr.get()),
-            (events != NULL) ? (cl_uint)events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-            (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+            (events != nullptr) ? (cl_uint)events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+            (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_UNMAP_SVM_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7849,19 +8774,19 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     template<typename T, class Alloc>
     cl_int enqueueUnmapSVM(
         cl::vector<T, Alloc> &container,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueSVMUnmap(
             object_, static_cast<void*>(container.data()),
-            (events != NULL) ? (cl_uint)events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-            (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+            (events != nullptr) ? (cl_uint)events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+            (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_UNMAP_SVM_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7881,19 +8806,19 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
      * have completed.
      */
     cl_int enqueueMarkerWithWaitList(
-        const vector<Event> *events = 0,
-        Event *event = 0) const
+        const vector<Event> *events = nullptr,
+        Event *event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueMarkerWithWaitList(
                 object_,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_MARKER_WAIT_LIST_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7911,19 +8836,19 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
      * before this command to command_queue, have completed.
      */
     cl_int enqueueBarrierWithWaitList(
-        const vector<Event> *events = 0,
-        Event *event = 0) const
+        const vector<Event> *events = nullptr,
+        Event *event = nullptr) const
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueBarrierWithWaitList(
                 object_,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_BARRIER_WAIT_LIST_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -7936,8 +8861,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     cl_int enqueueMigrateMemObjects(
         const vector<Memory> &memObjects,
         cl_mem_migration_flags flags,
-        const vector<Event>* events = NULL,
-        Event* event = NULL
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr
         ) const
     {
         cl_event tmp;
@@ -7947,68 +8872,189 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
         for( int i = 0; i < (int)memObjects.size(); ++i ) {
             localMemObjects[i] = memObjects[i]();
         }
-
-
+        
         cl_int err = detail::errHandler(
             ::clEnqueueMigrateMemObjects(
                 object_, 
                 (cl_uint)memObjects.size(), 
                 localMemObjects.data(),
                 flags,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
-    cl_int enqueueNDRangeKernel(
-        const Kernel& kernel,
-        const NDRange& offset,
-        const NDRange& global,
-        const NDRange& local = NullRange,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+    /**
+     * Enqueues a command that will allow the host to associate ranges within a set of
+     * SVM allocations with a device.
+     * @param sizes - The length from each pointer to migrate.
+     */
+    template<typename T>
+    cl_int enqueueMigrateSVM(
+        const cl::vector<T*> &svmRawPointers,
+        const cl::vector<size_type> &sizes,
+        cl_mem_migration_flags flags = 0,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
         cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueNDRangeKernel(
-                object_, kernel(), (cl_uint) global.dimensions(),
-                offset.dimensions() != 0 ? (const size_type*) offset : NULL,
-                (const size_type*) global,
-                local.dimensions() != 0 ? (const size_type*) local : NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_NDRANGE_KERNEL_ERR);
+        cl_int err = detail::errHandler(::clEnqueueSVMMigrateMem( // data() is T* const*, so it is viewed as const void* const* first
+            object_,
+            (cl_uint)svmRawPointers.size(), const_cast<const void**>(reinterpret_cast<const void* const*>(svmRawPointers.data())),
+            sizes.data(), // one length per pointer; expected to hold svmRawPointers.size() entries (not checked here)
+            flags,
+            (events != nullptr) ? (cl_uint)events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+            (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_MIGRATE_SVM_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
             *event = tmp;
 
         return err;
     }
 
-#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
-    CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
-        const Kernel& kernel,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+    /**
+     * Enqueues a command that will allow the host to associate a set of SVM allocations with
+     * a device. Forwards to the sizes-taking overload with one zero length per pointer.
+     */
+    template<typename T>
+    cl_int enqueueMigrateSVM(
+        const cl::vector<T*> &svmRawPointers,
+        cl_mem_migration_flags flags = 0,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueTask(
-                object_, kernel(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_TASK_ERR);
+        return enqueueMigrateSVM(svmRawPointers, cl::vector<size_type>(svmRawPointers.size()), flags, events, event); // sizes are value-initialized to 0; NOTE(review): presumably 0 selects the whole allocation - confirm against the clEnqueueSVMMigrateMem spec
+    }
 
-        if (event != NULL && err == CL_SUCCESS)
+
+    /**
+     * Enqueues a command that will allow the host to associate ranges within a set of
+     * SVM allocations with a device. The cl::pointer elements are unwrapped to raw addresses.
+     * @param sizes - The length from each pointer to migrate.
+     */
+    template<typename T, class D>
+    cl_int enqueueMigrateSVM(
+        const cl::vector<cl::pointer<T, D>> &svmPointers,
+        const cl::vector<size_type> &sizes,
+        cl_mem_migration_flags flags = 0,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    {
+        cl::vector<void*> svmRawPointers;
+        svmRawPointers.reserve(svmPointers.size());
+        for (const auto &p : svmPointers) { // by reference: the smart pointers must not be copied
+            svmRawPointers.push_back(static_cast<void*>(p.get()));
+        }
+
+        return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event); // raw-pointer overload does the enqueue
+    }
+
+
+    /**
+     * Enqueues a command that will allow the host to associate a set of SVM allocations with
+     * a device. Forwards to the sizes-taking overload with one zero length per pointer.
+     */
+    template<typename T, class D>
+    cl_int enqueueMigrateSVM(
+        const cl::vector<cl::pointer<T, D>> &svmPointers,
+        cl_mem_migration_flags flags = 0,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    {
+        return enqueueMigrateSVM(svmPointers, cl::vector<size_type>(svmPointers.size()), flags, events, event); // sizes are value-initialized to 0; NOTE(review): presumably 0 selects the whole allocation - confirm against the clEnqueueSVMMigrateMem spec
+    }
+
+    /**
+     * Enqueues a command that will allow the host to associate ranges within a set of
+     * SVM allocations with a device. Each container contributes the address of its own storage.
+     * @param sizes - The length from the beginning of each container to migrate.
+     */
+    template<typename T, class Alloc>
+    cl_int enqueueMigrateSVM(
+        const cl::vector<cl::vector<T, Alloc>> &svmContainers,
+        const cl::vector<size_type> &sizes,
+        cl_mem_migration_flags flags = 0,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    {
+        cl::vector<void*> svmRawPointers;
+        svmRawPointers.reserve(svmContainers.size());
+        for (const auto &p : svmContainers) { // by reference: a copy would yield the address of a temporary's storage
+            svmRawPointers.push_back(const_cast<void*>(static_cast<const void*>(p.data()))); // data() is const-qualified here
+        }
+
+        return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event); // raw-pointer overload does the enqueue
+    }
+
+    /**
+     * Enqueues a command that will allow the host to associate a set of SVM allocations with
+     * a device. Forwards to the sizes-taking overload with one zero length per container.
+     */
+    template<typename T, class Alloc>
+    cl_int enqueueMigrateSVM(
+        const cl::vector<cl::vector<T, Alloc>> &svmContainers,
+        cl_mem_migration_flags flags = 0,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    {
+        return enqueueMigrateSVM(svmContainers, cl::vector<size_type>(svmContainers.size()), flags, events, event); // sizes are value-initialized to 0; NOTE(review): presumably 0 selects the whole allocation - confirm against the clEnqueueSVMMigrateMem spec
+    }
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
+    
+    cl_int enqueueNDRangeKernel( // enqueues kernel via clEnqueueNDRangeKernel and returns its status
+        const Kernel& kernel,
+        const NDRange& offset,             // passed as nullptr when it has 0 dimensions
+        const NDRange& global,             // also supplies the work dimension count
+        const NDRange& local = NullRange,  // passed as nullptr when it has 0 dimensions
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNDRangeKernel(
+                object_, kernel(), (cl_uint) global.dimensions(),
+                offset.dimensions() != 0 ? (const size_type*) offset : nullptr,
+                (const size_type*) global,
+                local.dimensions() != 0 ? (const size_type*) local : nullptr,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, // never call front() on an empty list
+                (event != nullptr) ? &tmp : nullptr), // request an event only when the caller wants one
+            __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS) // hand the event back only after a successful enqueue
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+    CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
+        const Kernel& kernel,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueTask(
+                object_, kernel(),
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
+            __ENQUEUE_TASK_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -8018,33 +9064,24 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
     cl_int enqueueNativeKernel(
         void (CL_CALLBACK *userFptr)(void *),
         std::pair<void*, size_type> args,
-        const vector<Memory>* mem_objects = NULL,
-        const vector<const void*>* mem_locs = NULL,
-        const vector<Event>* events = NULL,
-        Event* event = NULL) const
+        const vector<Memory>* mem_objects = nullptr, // optional; nullptr means no memory objects are passed
+        const vector<const void*>* mem_locs = nullptr,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr) const
     {
-        size_type elements = 0;
-        if (mem_objects != NULL) {
-            elements = mem_objects->size();
-        }
-        vector<cl_mem> mems(elements);
-        for (unsigned int i = 0; i < elements; i++) {
-            mems[i] = ((*mem_objects)[i])();
-        }
-        
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueNativeKernel(
                 object_, userFptr, args.first, args.second,
-                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                mems.data(),
-                (mem_locs != NULL && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,
+                (mem_objects != nullptr && mem_objects->size() > 0) ? reinterpret_cast<const cl_mem *>(mem_objects->data()) : nullptr, // null check first: mem_objects defaults to nullptr
+                (mem_locs != nullptr && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : nullptr,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr), // request an event only when the caller supplied an output slot
             __ENQUEUE_NATIVE_KERNEL);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -8054,73 +9091,73 @@ class CommandQueue : public detail::Wrapper<cl_command_queue>
  * Deprecated APIs for 1.2
  */
 #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED 
-    cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    CL_API_PREFIX__VERSION_1_1_DEPRECATED 
+    cl_int enqueueMarker(Event* event = nullptr) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
             ::clEnqueueMarker(
                 object_, 
-                (event != NULL) ? &tmp : NULL),
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_MARKER_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
     }
 
-    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-    cl_int enqueueWaitForEvents(const vector<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    CL_API_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueWaitForEvents(const vector<Event>& events) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED
     {
         return detail::errHandler(
             ::clEnqueueWaitForEvents(
                 object_,
                 (cl_uint) events.size(),
-                events.size() > 0 ? (const cl_event*) &events.front() : NULL),
+                events.size() > 0 ? (const cl_event*) &events.front() : nullptr),
             __ENQUEUE_WAIT_FOR_EVENTS_ERR);
     }
 #endif // defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
 
     cl_int enqueueAcquireGLObjects(
-         const vector<Memory>* mem_objects = NULL,
-         const vector<Event>* events = NULL,
-         Event* event = NULL) const
+         const vector<Memory>* mem_objects = nullptr,
+         const vector<Event>* events = nullptr,
+         Event* event = nullptr) const
      {
         cl_event tmp;
         cl_int err = detail::errHandler(
              ::clEnqueueAcquireGLObjects(
                  object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
+                 (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,
+                 (events != nullptr) ? (cl_uint) events->size() : 0,
+                 (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                 (event != nullptr) ? &tmp : nullptr),
              __ENQUEUE_ACQUIRE_GL_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
      }
 
     cl_int enqueueReleaseGLObjects(
-         const vector<Memory>* mem_objects = NULL,
-         const vector<Event>* events = NULL,
-         Event* event = NULL) const
+         const vector<Memory>* mem_objects = nullptr,
+         const vector<Event>* events = nullptr,
+         Event* event = nullptr) const
      {
         cl_event tmp;
         cl_int err = detail::errHandler(
              ::clEnqueueReleaseGLObjects(
                  object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
+                 (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,
+                 (events != nullptr) ? (cl_uint) events->size() : 0,
+                 (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                 (event != nullptr) ? &tmp : nullptr),
              __ENQUEUE_RELEASE_GL_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -8137,18 +9174,18 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
     const cl_event* event_wait_list, cl_event* event);
 
     cl_int enqueueAcquireD3D10Objects(
-         const vector<Memory>* mem_objects = NULL,
-         const vector<Event>* events = NULL,
-         Event* event = NULL) const
+         const vector<Memory>* mem_objects = nullptr,
+         const vector<Event>* events = nullptr,
+         Event* event = nullptr) const
     {
-        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = nullptr;
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
         cl_context context = getInfo<CL_QUEUE_CONTEXT>();
         cl::Device device(getInfo<CL_QUEUE_DEVICE>());
         cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
         CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireD3D10ObjectsKHR);
 #endif
-#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
         CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueAcquireD3D10ObjectsKHR);
 #endif
         
@@ -8156,47 +9193,47 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
         cl_int err = detail::errHandler(
              pfn_clEnqueueAcquireD3D10ObjectsKHR(
                  object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
+                 (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,
+                 (events != nullptr) ? (cl_uint) events->size() : 0,
+                 (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr, // front() on an empty vector is undefined; pass nullptr instead
+                 (event != nullptr) ? &tmp : nullptr), // request an event only when the caller supplied an output slot
              __ENQUEUE_ACQUIRE_GL_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
      }
 
     cl_int enqueueReleaseD3D10Objects(
-         const vector<Memory>* mem_objects = NULL,
-         const vector<Event>* events = NULL,
-         Event* event = NULL) const
+         const vector<Memory>* mem_objects = nullptr,
+         const vector<Event>* events = nullptr,
+         Event* event = nullptr) const
     {
-        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = nullptr;
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
         cl_context context = getInfo<CL_QUEUE_CONTEXT>();
         cl::Device device(getInfo<CL_QUEUE_DEVICE>());
         cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
         CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseD3D10ObjectsKHR);
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
-#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
         CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueReleaseD3D10ObjectsKHR);
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+#endif
 
         cl_event tmp;
         cl_int err = detail::errHandler(
             pfn_clEnqueueReleaseD3D10ObjectsKHR(
                 object_,
-                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
+                (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,
+                (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr),
             __ENQUEUE_RELEASE_GL_ERR);
 
-        if (event != NULL && err == CL_SUCCESS)
+        if (event != nullptr && err == CL_SUCCESS)
             *event = tmp;
 
         return err;
@@ -8207,8 +9244,8 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
  * Deprecated APIs for 1.2
  */
 #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-    cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    CL_API_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueBarrier() const CL_API_SUFFIX__VERSION_1_1_DEPRECATED
     {
         return detail::errHandler(
             ::clEnqueueBarrier(object_),
@@ -8225,8 +9262,86 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
     {
         return detail::errHandler(::clFinish(object_), __FINISH_ERR);
     }
+
+#ifdef cl_khr_external_memory
+    cl_int enqueueAcquireExternalMemObjects( // enqueues an acquire of mem_objects through the KHR extension entry point
+        const vector<Memory>& mem_objects,
+        const vector<Event>* events_wait = nullptr,
+        Event *event = nullptr)
+    {
+        cl_int err = CL_INVALID_OPERATION; // stays at this value when the extension function pointer is null
+        cl_event tmp;
+
+        std::call_once(ext_memory_initialized_, initMemoryExtension, this->getInfo<CL_QUEUE_DEVICE>()); // one-time init; presumably resolves the pfn_* pointers - confirm
+
+        if (pfn_clEnqueueAcquireExternalMemObjectsKHR)
+        {
+            err = pfn_clEnqueueAcquireExternalMemObjectsKHR(
+                object_,
+                static_cast<cl_uint>(mem_objects.size()),
+                (mem_objects.size() > 0) ? reinterpret_cast<const cl_mem *>(mem_objects.data()) : nullptr,
+                (events_wait != nullptr) ? static_cast<cl_uint>(events_wait->size()) : 0,
+                (events_wait != nullptr && events_wait->size() > 0) ? reinterpret_cast<const cl_event*>(events_wait->data()) : nullptr,
+                (event != nullptr) ? &tmp : nullptr); // an unconditional &tmp would leave the returned event unreleased when event is nullptr
+        }
+
+        detail::errHandler(err, __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS)
+            *event = tmp; // store the raw handle in the caller's Event
+
+        return err;
+    }
+
+    cl_int enqueueReleaseExternalMemObjects( // enqueues a release of mem_objects through the KHR extension entry point
+        const vector<Memory>& mem_objects,
+        const vector<Event>* events_wait = nullptr,
+        Event *event = nullptr)
+    {
+        cl_int err = CL_INVALID_OPERATION; // stays at this value when the extension function pointer is null
+        cl_event tmp;
+
+        std::call_once(ext_memory_initialized_, initMemoryExtension, this->getInfo<CL_QUEUE_DEVICE>()); // one-time init; presumably resolves the pfn_* pointers - confirm
+
+        if (pfn_clEnqueueReleaseExternalMemObjectsKHR)
+        {
+            err = pfn_clEnqueueReleaseExternalMemObjectsKHR(
+                object_,
+                static_cast<cl_uint>(mem_objects.size()),
+                (mem_objects.size() > 0) ? reinterpret_cast<const cl_mem *>(mem_objects.data()) : nullptr,
+                (events_wait != nullptr) ? static_cast<cl_uint>(events_wait->size()) : 0,
+                (events_wait != nullptr && events_wait->size() > 0) ? reinterpret_cast<const cl_event*>(events_wait->data()) : nullptr,
+                (event != nullptr) ? &tmp : nullptr); // an unconditional &tmp would leave the returned event unreleased when event is nullptr
+        }
+
+        detail::errHandler(err, __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR);
+
+        if (event != nullptr && err == CL_SUCCESS)
+            *event = tmp; // store the raw handle in the caller's Event
+
+        return err;
+    }
+#endif // cl_khr_external_memory
+
+#ifdef cl_khr_semaphore
+    cl_int enqueueWaitSemaphores( // declaration only; the definition is not in this class body
+        const vector<Semaphore> &sema_objects,
+        const vector<cl_semaphore_payload_khr> &sema_payloads = {},
+        const vector<Event>* events_wait_list = nullptr,
+        Event *event = nullptr) const;
+
+    cl_int enqueueSignalSemaphores( // declaration only; the definition is not in this class body
+        const vector<Semaphore> &sema_objects,
+        const vector<cl_semaphore_payload_khr>& sema_payloads = {},
+        const vector<Event>* events_wait_list = nullptr,
+        Event* event = nullptr); // NOTE(review): non-const, unlike enqueueWaitSemaphores above - confirm this is intended
+#endif // cl_khr_semaphore
 }; // CommandQueue
 
+#ifdef cl_khr_external_memory
+CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::ext_memory_initialized_;
+#endif
+
 CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::default_initialized_;
 CL_HPP_DEFINE_STATIC_MEMBER_ CommandQueue CommandQueue::default_;
 CL_HPP_DEFINE_STATIC_MEMBER_ cl_int CommandQueue::default_error_ = CL_SUCCESS;
@@ -8259,7 +9374,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
     /*!
      * Default construct device command queue on default context and device
      */
-    DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = NULL)
+    DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = nullptr)
     {
         cl_int error;
         cl::Context context = cl::Context::getDefault();
@@ -8274,7 +9389,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
             context(), device(), queue_properties, &error);
 
         detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -8286,7 +9401,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
         const Context& context,
         const Device& device,
         DeviceQueueProperties properties = DeviceQueueProperties::None,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -8298,7 +9413,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
             context(), device(), queue_properties, &error);
 
         detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -8311,7 +9426,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
         const Device& device,
         cl_uint queueSize,
         DeviceQueueProperties properties = DeviceQueueProperties::None,
-        cl_int* err = NULL)
+        cl_int* err = nullptr)
     {
         cl_int error;
 
@@ -8325,7 +9440,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
             context(), device(), queue_properties, &error);
 
         detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -8345,34 +9460,6 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
         return *this;
     }
 
-    /*! \brief Copy constructor to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    DeviceCommandQueue(const DeviceCommandQueue& queue) : detail::Wrapper<cl_type>(queue) {}
-
-    /*! \brief Copy assignment to forward copy to the superclass correctly.
-     * Required for MSVC.
-     */
-    DeviceCommandQueue& operator = (const DeviceCommandQueue &queue)
-    {
-        detail::Wrapper<cl_type>::operator=(queue);
-        return *this;
-    }
-
-    /*! \brief Move constructor to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    DeviceCommandQueue(DeviceCommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(queue)) {}
-
-    /*! \brief Move assignment to forward move to the superclass correctly.
-     * Required for MSVC.
-     */
-    DeviceCommandQueue& operator = (DeviceCommandQueue &&queue)
-    {
-        detail::Wrapper<cl_type>::operator=(std::move(queue));
-        return *this;
-    }
-
     template <typename T>
     cl_int getInfo(cl_command_queue_info name, T* param) const
     {
@@ -8382,25 +9469,25 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
             __GET_COMMAND_QUEUE_INFO_ERR);
     }
 
-    template <cl_int name> typename
+    template <cl_command_queue_info name> typename
         detail::param_traits<detail::cl_command_queue_info, name>::param_type
-        getInfo(cl_int* err = NULL) const
+        getInfo(cl_int* err = nullptr) const
     {
         typename detail::param_traits<
             detail::cl_command_queue_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = result;
         }
         return param;
     }
 
     /*!
-    * Create a new default device command queue for the default device,
-    * in the default context and of the default size.
-    * If there is already a default queue for the specified device this
-    * function will return the pre-existing queue.
-    */
+     * Create a new default device command queue for the default device,
+     * in the default context and of the default size.
+     * If there is already a default queue for the specified device this
+     * function will return the pre-existing queue.
+     */
     static DeviceCommandQueue makeDefault(
         cl_int *err = nullptr)
     {
@@ -8418,7 +9505,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
             context(), device(), queue_properties, &error));
 
         detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 
@@ -8426,11 +9513,11 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
     }
 
     /*!
-    * Create a new default device command queue for the specified device
-    * and of the default size.
-    * If there is already a default queue for the specified device this
-    * function will return the pre-existing queue.
-    */
+     * Create a new default device command queue for the specified device
+     * and of the default size.
+     * If there is already a default queue for the specified device this
+     * function will return the pre-existing queue.
+     */
     static DeviceCommandQueue makeDefault(
         const Context &context, const Device &device, cl_int *err = nullptr)
     {
@@ -8446,7 +9533,7 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
             context(), device(), queue_properties, &error));
 
         detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 
@@ -8475,12 +9562,43 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
                 context(), device(), queue_properties, &error));
 
         detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 
         return deviceQueue;
     }
+
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 210
+    /*!
+     * Set default_queue as the default device command queue of device in
+     * context by calling clSetDefaultDeviceCommandQueue. May be called
+     * repeatedly to account for kernels that rely on the default queue.
+     * @return default_queue, even when the call fails; check err for status.
+     */
+    static DeviceCommandQueue updateDefault(const Context &context, const Device &device, const DeviceCommandQueue &default_queue, cl_int *err = nullptr)
+    {
+        cl_int error;
+        error = clSetDefaultDeviceCommandQueue(context.get(), device.get(), default_queue.get());
+
+        detail::errHandler(error, __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR);
+        if (err != nullptr) {
+            *err = error; // report the status to the caller when an output slot was given
+        }
+        return default_queue;
+    }
+
+    /*!
+     * Return the CL_QUEUE_DEVICE_DEFAULT value queried from the given command queue.
+     */
+    static DeviceCommandQueue getDefault(const CommandQueue &queue, cl_int * err = nullptr)
+    {
+        return queue.getInfo<CL_QUEUE_DEVICE_DEFAULT>(err); // err is forwarded to getInfo unchanged
+    }
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210
 }; // DeviceCommandQueue
 
 namespace detail
@@ -8523,26 +9641,26 @@ Buffer::Buffer(
     size_type size = sizeof(DataType)*(endIterator - startIterator);
 
     if( useHostPtr ) {
-        object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+        object_ = ::clCreateBuffer(context(), flags, size, const_cast<DataType*>(&*startIterator), &error);
     } else {
         object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
     }
 
     detail::errHandler(error, __CREATE_BUFFER_ERR);
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = error;
     }
 
     if( !useHostPtr ) {
         CommandQueue queue(context, 0, &error);
         detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
 
         error = cl::copy(queue, startIterator, endIterator, *this);
         detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -8576,21 +9694,21 @@ Buffer::Buffer(
     Context context = queue.getInfo<CL_QUEUE_CONTEXT>();
 
     if (useHostPtr) {
-        object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+        object_ = ::clCreateBuffer(context(), flags, size, const_cast<DataType*>(&*startIterator), &error);
     }
     else {
         object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
     }
 
     detail::errHandler(error, __CREATE_BUFFER_ERR);
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = error;
     }
 
     if (!useHostPtr) {
         error = cl::copy(queue, startIterator, endIterator, *this);
         detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
+        if (err != nullptr) {
             *err = error;
         }
     }
@@ -8602,8 +9720,8 @@ inline cl_int enqueueReadBuffer(
     size_type offset,
     size_type size,
     void* ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -8621,8 +9739,8 @@ inline cl_int enqueueWriteBuffer(
         size_type offset,
         size_type size,
         const void* ptr,
-        const vector<Event>* events = NULL,
-        Event* event = NULL)
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -8640,26 +9758,26 @@ inline void* enqueueMapBuffer(
         cl_map_flags flags,
         size_type offset,
         size_type size,
-        const vector<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL)
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr,
+        cl_int* err = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
     detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = error;
     }
 
     void * result = ::clEnqueueMapBuffer(
             queue(), buffer(), blocking, flags, offset, size,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (events != nullptr) ? (cl_uint) events->size() : 0,
+            (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
             (cl_event*) event,
             &error);
 
     detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-    if (err != NULL) {
+    if (err != nullptr) {
         *err = error;
     }
     return result;
@@ -8684,7 +9802,7 @@ inline cl_int enqueueMapSVM(
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
     if (error != CL_SUCCESS) {
-        return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+        return detail::errHandler(error, __ENQUEUE_MAP_SVM_ERR);
     }
 
     return queue.enqueueMapSVM(
@@ -8698,12 +9816,12 @@ inline cl_int enqueueMapSVM(
  */
 template<typename T, class D>
 inline cl_int enqueueMapSVM(
-    cl::pointer<T, D> ptr,
+    cl::pointer<T, D> &ptr,
     cl_bool blocking,
     cl_map_flags flags,
     size_type size,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -8722,16 +9840,16 @@ inline cl_int enqueueMapSVM(
  */
 template<typename T, class Alloc>
 inline cl_int enqueueMapSVM(
-    cl::vector<T, Alloc> container,
+    cl::vector<T, Alloc> &container,
     cl_bool blocking,
     cl_map_flags flags,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
     if (error != CL_SUCCESS) {
-        return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+        return detail::errHandler(error, __ENQUEUE_MAP_SVM_ERR);
     }
 
     return queue.enqueueMapSVM(
@@ -8743,8 +9861,8 @@ inline cl_int enqueueMapSVM(
 inline cl_int enqueueUnmapMemObject(
     const Memory& memory,
     void* mapped_ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -8757,12 +9875,12 @@ inline cl_int enqueueUnmapMemObject(
     cl_int err = detail::errHandler(
         ::clEnqueueUnmapMemObject(
         queue(), memory(), mapped_ptr,
-        (events != NULL) ? (cl_uint)events->size() : 0,
-        (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
-        (event != NULL) ? &tmp : NULL),
+        (events != nullptr) ? (cl_uint)events->size() : 0,
+        (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,
+        (event != nullptr) ? &tmp : nullptr),
         __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
 
-    if (event != NULL && err == CL_SUCCESS)
+    if (event != nullptr && err == CL_SUCCESS)
         *event = tmp;
 
     return err;
@@ -8777,17 +9895,17 @@ inline cl_int enqueueUnmapMemObject(
 template<typename T>
 inline cl_int enqueueUnmapSVM(
     T* ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
     if (error != CL_SUCCESS) {
-        return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+        return detail::errHandler(error, __ENQUEUE_UNMAP_SVM_ERR);
     }
 
     return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event), 
-        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+        __ENQUEUE_UNMAP_SVM_ERR);
 
 }
 
@@ -8799,17 +9917,17 @@ inline cl_int enqueueUnmapSVM(
 template<typename T, class D>
 inline cl_int enqueueUnmapSVM(
     cl::pointer<T, D> &ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
     if (error != CL_SUCCESS) {
-        return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+        return detail::errHandler(error, __ENQUEUE_UNMAP_SVM_ERR);
     }
 
     return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event),
-        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+        __ENQUEUE_UNMAP_SVM_ERR);
 }
 
 /**
@@ -8820,17 +9938,17 @@ inline cl_int enqueueUnmapSVM(
 template<typename T, class Alloc>
 inline cl_int enqueueUnmapSVM(
     cl::vector<T, Alloc> &container,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
     if (error != CL_SUCCESS) {
-        return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+        return detail::errHandler(error, __ENQUEUE_UNMAP_SVM_ERR);
     }
 
     return detail::errHandler(queue.enqueueUnmapSVM(container, events, event),
-        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+        __ENQUEUE_UNMAP_SVM_ERR);
 }
 
 #endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
@@ -8841,8 +9959,8 @@ inline cl_int enqueueCopyBuffer(
         size_type src_offset,
         size_type dst_offset,
         size_type size,
-        const vector<Event>* events = NULL,
-        Event* event = NULL)
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -8990,8 +10108,8 @@ inline cl_int enqueueReadBufferRect(
     size_type host_row_pitch,
     size_type host_slice_pitch,
     void *ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -9015,6 +10133,35 @@ inline cl_int enqueueReadBufferRect(
         event);
 }
 
+/**
+ * Two-component overload of enqueueReadBufferRect.
+ * Pads both offsets with a zero third component and the region with a
+ * third component of one, then forwards to the three-component overload.
+ */
+inline cl_int enqueueReadBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const array<size_type, 2>& buffer_offset,
+    const array<size_type, 2>& host_offset,
+    const array<size_type, 2>& region,
+    size_type buffer_row_pitch,
+    size_type buffer_slice_pitch,
+    size_type host_row_pitch,
+    size_type host_slice_pitch,
+    void* ptr,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    const array<size_type, 3> buffer_offset_3d = { buffer_offset[0], buffer_offset[1], 0 };
+    const array<size_type, 3> host_offset_3d = { host_offset[0], host_offset[1], 0 };
+    const array<size_type, 3> region_3d = { region[0], region[1], 1 };
+
+    return enqueueReadBufferRect(
+        buffer, blocking, buffer_offset_3d, host_offset_3d, region_3d,
+        buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+        ptr, events, event);
+}
+
 inline cl_int enqueueWriteBufferRect(
     const Buffer& buffer,
     cl_bool blocking,
@@ -9026,8 +10173,8 @@ inline cl_int enqueueWriteBufferRect(
     size_type host_row_pitch,
     size_type host_slice_pitch,
     const void *ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -9051,6 +10198,35 @@ inline cl_int enqueueWriteBufferRect(
         event);
 }
 
+/**
+ * Two-component overload of enqueueWriteBufferRect.
+ * Pads both offsets with a zero third component and the region with a
+ * third component of one, then forwards to the three-component overload.
+ */
+inline cl_int enqueueWriteBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const array<size_type, 2>& buffer_offset,
+    const array<size_type, 2>& host_offset,
+    const array<size_type, 2>& region,
+    size_type buffer_row_pitch,
+    size_type buffer_slice_pitch,
+    size_type host_row_pitch,
+    size_type host_slice_pitch,
+    const void* ptr,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    const array<size_type, 3> buffer_offset_3d = { buffer_offset[0], buffer_offset[1], 0 };
+    const array<size_type, 3> host_offset_3d = { host_offset[0], host_offset[1], 0 };
+    const array<size_type, 3> region_3d = { region[0], region[1], 1 };
+
+    return enqueueWriteBufferRect(
+        buffer, blocking, buffer_offset_3d, host_offset_3d, region_3d,
+        buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch,
+        ptr, events, event);
+}
+
 inline cl_int enqueueCopyBufferRect(
     const Buffer& src,
     const Buffer& dst,
@@ -9061,8 +10237,8 @@ inline cl_int enqueueCopyBufferRect(
     size_type src_slice_pitch,
     size_type dst_row_pitch,
     size_type dst_slice_pitch,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -9084,6 +10260,33 @@ inline cl_int enqueueCopyBufferRect(
         events, 
         event);
 }
+
+/**
+ * Two-component overload of enqueueCopyBufferRect: pads both origins with 0
+ * and the region with 1, then forwards to the three-component overload.
+ */
+inline cl_int enqueueCopyBufferRect(
+    const Buffer& src,
+    const Buffer& dst,
+    const array<size_type, 2>& src_origin,
+    const array<size_type, 2>& dst_origin,
+    const array<size_type, 2>& region,
+    size_type src_row_pitch,
+    size_type src_slice_pitch,
+    size_type dst_row_pitch,
+    size_type dst_slice_pitch,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    const array<size_type, 3> src_origin_3d = { src_origin[0], src_origin[1], 0 };
+    const array<size_type, 3> dst_origin_3d = { dst_origin[0], dst_origin[1], 0 };
+    const array<size_type, 3> region_3d = { region[0], region[1], 1 };
+
+    return enqueueCopyBufferRect(
+        src, dst, src_origin_3d, dst_origin_3d, region_3d,
+        src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
+        events, event);
+}
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
 
 inline cl_int enqueueReadImage(
@@ -9094,8 +10297,8 @@ inline cl_int enqueueReadImage(
     size_type row_pitch,
     size_type slice_pitch,
     void* ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL) 
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr) 
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -9116,6 +10319,29 @@ inline cl_int enqueueReadImage(
         event);
 }
 
+/**
+ * Two-component overload of enqueueReadImage: pads the origin with 0 and
+ * the region with 1, then forwards to the three-component overload.
+ */
+inline cl_int enqueueReadImage(
+    const Image& image,
+    cl_bool blocking,
+    const array<size_type, 2>& origin,
+    const array<size_type, 2>& region,
+    size_type row_pitch,
+    size_type slice_pitch,
+    void* ptr,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    const array<size_type, 3> origin_3d = { origin[0], origin[1], 0 };
+    const array<size_type, 3> region_3d = { region[0], region[1], 1 };
+
+    return enqueueReadImage(
+        image, blocking, origin_3d, region_3d,
+        row_pitch, slice_pitch, ptr, events, event);
+}
+
 inline cl_int enqueueWriteImage(
     const Image& image,
     cl_bool blocking,
@@ -9124,8 +10350,8 @@ inline cl_int enqueueWriteImage(
     size_type row_pitch,
     size_type slice_pitch,
     const void* ptr,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -9146,14 +10372,37 @@ inline cl_int enqueueWriteImage(
         event);
 }
 
+/**
+ * Two-component overload of enqueueWriteImage: pads the origin with 0 and
+ * the region with 1, then forwards to the three-component overload.
+ */
+inline cl_int enqueueWriteImage(
+    const Image& image,
+    cl_bool blocking,
+    const array<size_type, 2>& origin,
+    const array<size_type, 2>& region,
+    size_type row_pitch,
+    size_type slice_pitch,
+    const void* ptr,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    const array<size_type, 3> origin_3d = { origin[0], origin[1], 0 };
+    const array<size_type, 3> region_3d = { region[0], region[1], 1 };
+
+    return enqueueWriteImage(
+        image, blocking, origin_3d, region_3d,
+        row_pitch, slice_pitch, ptr, events, event);
+}
+
 inline cl_int enqueueCopyImage(
     const Image& src,
     const Image& dst,
     const array<size_type, 3>& src_origin,
     const array<size_type, 3>& dst_origin,
     const array<size_type, 3>& region,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -9172,14 +10421,33 @@ inline cl_int enqueueCopyImage(
         event);
 }
 
+/**
+ * Two-component overload of enqueueCopyImage; origins are padded with 0, region with 1.
+ */
+inline cl_int enqueueCopyImage(
+    const Image& src,
+    const Image& dst,
+    const array<size_type, 2>& src_origin,
+    const array<size_type, 2>& dst_origin,
+    const array<size_type, 2>& region,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    const array<size_type, 3> src_origin_3d = { src_origin[0], src_origin[1], 0 };
+    const array<size_type, 3> dst_origin_3d = { dst_origin[0], dst_origin[1], 0 };
+    const array<size_type, 3> region_3d = { region[0], region[1], 1 };
+    return enqueueCopyImage(
+        src, dst, src_origin_3d, dst_origin_3d, region_3d, events, event);
+}
+
 inline cl_int enqueueCopyImageToBuffer(
     const Image& src,
     const Buffer& dst,
     const array<size_type, 3>& src_origin,
     const array<size_type, 3>& region,
     size_type dst_offset,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
 {
     cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
@@ -9198,16 +10466,35 @@ inline cl_int enqueueCopyImageToBuffer(
         event);
 }
 
-inline cl_int enqueueCopyBufferToImage(
-    const Buffer& src,
-    const Image& dst,
-    size_type src_offset,
-    const array<size_type, 3>& dst_origin,
-    const array<size_type, 3>& region,
-    const vector<Event>* events = NULL,
-    Event* event = NULL)
-{
-    cl_int error;
+/**
+ * Two-component overload of enqueueCopyImageToBuffer; origin is padded with 0, region with 1.
+ */
+inline cl_int enqueueCopyImageToBuffer(
+    const Image& src,
+    const Buffer& dst,
+    const array<size_type, 2>& src_origin,
+    const array<size_type, 2>& region,
+    size_type dst_offset,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    const array<size_type, 3> src_origin_3d = { src_origin[0], src_origin[1], 0 };
+    const array<size_type, 3> region_3d = { region[0], region[1], 1 };
+
+    return enqueueCopyImageToBuffer(
+        src, dst, src_origin_3d, region_3d, dst_offset, events, event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+    const Buffer& src,
+    const Image& dst,
+    size_type src_offset,
+    const array<size_type, 3>& dst_origin,
+    const array<size_type, 3>& region,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    cl_int error;
     CommandQueue queue = CommandQueue::getDefault(&error);
 
     if (error != CL_SUCCESS) {
@@ -9224,6 +10511,31 @@ inline cl_int enqueueCopyBufferToImage(
         event);
 }
 
+/**
+ * Two-component overload of enqueueCopyBufferToImage.
+ * Pads dst_origin with a zero third component and region with a third
+ * component of one, then forwards to the three-component overload. That
+ * overload performs the default CommandQueue lookup itself, so no separate
+ * lookup (and no separate raw error return) is done here.
+ */
+inline cl_int enqueueCopyBufferToImage(
+    const Buffer& src,
+    const Image& dst,
+    size_type src_offset,
+    const array<size_type, 2>& dst_origin,
+    const array<size_type, 2>& region,
+    const vector<Event>* events = nullptr,
+    Event* event = nullptr)
+{
+    return enqueueCopyBufferToImage(
+        src,
+        dst,
+        src_offset,
+        { dst_origin[0], dst_origin[1], 0 },
+        { region[0], region[1], 1 },
+        events,
+        event);
+}
 
 inline cl_int flush(void)
 {
@@ -9472,7 +10784,7 @@ class KernelFunctor
     KernelFunctor(
         const Program& program,
         const string name,
-        cl_int * err = NULL) :
+        cl_int * err = nullptr) :
         kernel_(program, name.c_str(), err)
     {}
 
@@ -9548,7 +10860,7 @@ class KernelFunctor
 
 namespace compatibility {
     /**
-     * Backward compatibility class to ensure that cl.hpp code works with cl2.hpp.
+     * Backward compatibility class to ensure that cl.hpp code works with opencl.hpp.
      * Please use KernelFunctor directly.
      */
     template<typename... Ts>
@@ -9561,7 +10873,7 @@ namespace compatibility {
         make_kernel(
             const Program& program,
             const string name,
-            cl_int * err = NULL) :
+            cl_int * err = nullptr) :
             functor_(FunctorType(program, name, err))
         {}
 
@@ -9588,90 +10900,937 @@ namespace compatibility {
     };
 } // namespace compatibility
 
+#ifdef cl_khr_semaphore
+
+#ifdef cl_khr_external_semaphore
+enum ExternalSemaphoreType : cl_external_semaphore_handle_type_khr // enumerators below exist only when their extension macro is defined
+{
+    None = 0, // NOTE(review): unscoped enumerator, so `None` is visible in the enclosing namespace and may clash with a `None` macro (e.g. X11) - confirm
+#ifdef cl_khr_external_semaphore_dx_fence
+    D3D12Fence = CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR,
+#endif
+#ifdef cl_khr_external_semaphore_opaque_fd
+    OpaqueFd = CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR,
+#endif
+#ifdef cl_khr_external_semaphore_sync_fd
+    SyncFd = CL_SEMAPHORE_HANDLE_SYNC_FD_KHR,
+#endif
+#ifdef cl_khr_external_semaphore_win32
+    OpaqueWin32 = CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR,
+    OpaqueWin32Kmt = CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR,
+#endif // cl_khr_external_semaphore_win32
+};
+#endif // cl_khr_external_semaphore
 
+/*! \class Semaphore
+ * \brief Class interface for cl_semaphore_khr.
+ */
+class Semaphore : public detail::Wrapper<cl_semaphore_khr>
+{
+public:
+    Semaphore() : detail::Wrapper<cl_type>() {}
+    //! \brief Creates a semaphore in \p context from the property list \p sema_props.
+    Semaphore(
+        const Context &context,
+        const vector<cl_semaphore_properties_khr>& sema_props,
+        cl_int *err = nullptr)
+    {
+        /* initialization of addresses to extension functions (it is done only once) */
+        std::call_once(ext_init_, initExtensions, context);
+
+        cl_int error = CL_INVALID_OPERATION;
+        if (pfn_clCreateSemaphoreWithPropertiesKHR)
+        {
+            object_ = pfn_clCreateSemaphoreWithPropertiesKHR(
+                context(),
+                sema_props.data(),
+                &error);
+        }
+
+        detail::errHandler(error, __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR);
+        if (err != nullptr) {
+            *err = error;
+        }
+    }
+    //! \brief Creates a semaphore in the context returned by Context::getDefault().
+    Semaphore(
+        const vector<cl_semaphore_properties_khr>& sema_props,
+        cl_int* err = nullptr):Semaphore(Context::getDefault(err), sema_props, err) {}
+    //! \brief Wraps an existing cl_semaphore_khr handle; \p retainObject is forwarded to the wrapper.
+    explicit Semaphore(const cl_semaphore_khr& semaphore, bool retainObject = false) :
+        detail::Wrapper<cl_type>(semaphore, retainObject) {}
+    //! \brief Assigns from a raw cl_semaphore_khr handle.
+    Semaphore& operator = (const cl_semaphore_khr& rhs) {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+    //! \brief Wrapper for clGetSemaphoreInfoKHR(); CL_INVALID_OPERATION if it is unavailable.
+    template <typename T>
+    cl_int getInfo(cl_semaphore_info_khr name, T* param) const
+    {
+        if (pfn_clGetSemaphoreInfoKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                                      __GET_SEMAPHORE_KHR_INFO_ERR);
+        }
+        return detail::errHandler(
+            detail::getInfo(pfn_clGetSemaphoreInfoKHR, object_, name, param),
+            __GET_SEMAPHORE_KHR_INFO_ERR);
+    }
+    //! \brief Typed getInfo(): returns the value for \p name and stores the status in \p err if given.
+    template <cl_semaphore_info_khr name> typename
+    detail::param_traits<detail::cl_semaphore_info_khr, name>::param_type
+    getInfo(cl_int* err = nullptr) const
+    {
+        typename detail::param_traits<
+            detail::cl_semaphore_info_khr, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != nullptr) {
+            *err = result;
+        }
+        return param;
+    }
+#ifdef cl_khr_external_semaphore
+    //! \brief Wrapper for clGetSemaphoreHandleForTypeKHR(); CL_INVALID_OPERATION if it is unavailable.
+    template <typename T>
+    cl_int getHandleForTypeKHR(
+        const Device& device, cl_external_semaphore_handle_type_khr name, T* param) const
+    {
+        if (pfn_clGetSemaphoreHandleForTypeKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                                      __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR);
+        }
+        return detail::errHandler(
+            detail::getInfo(
+                pfn_clGetSemaphoreHandleForTypeKHR, object_, device(), name, param),
+                __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR);
+    }
+    //! \brief Typed getHandleForTypeKHR(): returns the handle and stores the status in \p err if given.
+    template <cl_external_semaphore_handle_type_khr type> typename
+    detail::param_traits<detail::cl_external_semaphore_handle_type_khr, type>::param_type
+        getHandleForTypeKHR(const Device& device, cl_int* err = nullptr) const
+    {
+        typename detail::param_traits<
+        detail::cl_external_semaphore_handle_type_khr, type>::param_type param;
+        cl_int result = getHandleForTypeKHR(device, type, &param);
+        if (err != nullptr) {
+            *err = result;
+        }
+        return param;
+    }
+#endif // cl_khr_external_semaphore
+    //! \brief Calls clRetainSemaphoreKHR() on the wrapped handle; the status is routed through detail::errHandler.
+    cl_int retain()
+    {
+        if (pfn_clRetainSemaphoreKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                                      __RETAIN_SEMAPHORE_KHR_ERR);
+        }
+        return detail::errHandler(pfn_clRetainSemaphoreKHR(object_), __RETAIN_SEMAPHORE_KHR_ERR);
+    }
+    //! \brief Calls clReleaseSemaphoreKHR() on the wrapped handle; the status is routed through detail::errHandler.
+    cl_int release()
+    {
+        if (pfn_clReleaseSemaphoreKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                                      __RELEASE_SEMAPHORE_KHR_ERR);
+        }
+        return detail::errHandler(pfn_clReleaseSemaphoreKHR(object_), __RELEASE_SEMAPHORE_KHR_ERR);
+    }
+private:
+    static std::once_flag ext_init_;
+    //! \brief Initializes the pfn_* pointers used by this class; reports CL_INVALID_VALUE only when all of them are null.
+    static void initExtensions(const Context& context)
+    {
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+        Device device = context.getInfo<CL_CONTEXT_DEVICES>().at(0);
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateSemaphoreWithPropertiesKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clReleaseSemaphoreKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clRetainSemaphoreKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueWaitSemaphoresKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueSignalSemaphoresKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetSemaphoreInfoKHR);
+#ifdef cl_khr_external_semaphore
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetSemaphoreHandleForTypeKHR);
+#endif // cl_khr_external_semaphore
+#else
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSemaphoreWithPropertiesKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clReleaseSemaphoreKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clRetainSemaphoreKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueWaitSemaphoresKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueSignalSemaphoresKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetSemaphoreInfoKHR);
+#ifdef cl_khr_external_semaphore
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetSemaphoreHandleForTypeKHR);
+#endif // cl_khr_external_semaphore
+#endif
+        if ((pfn_clCreateSemaphoreWithPropertiesKHR == nullptr) &&
+            (pfn_clReleaseSemaphoreKHR              == nullptr) &&
+            (pfn_clRetainSemaphoreKHR               == nullptr) &&
+            (pfn_clEnqueueWaitSemaphoresKHR         == nullptr) &&
+            (pfn_clEnqueueSignalSemaphoresKHR       == nullptr) &&
+#ifdef cl_khr_external_semaphore
+            (pfn_clGetSemaphoreHandleForTypeKHR     == nullptr) &&
+#endif // cl_khr_external_semaphore
+            (pfn_clGetSemaphoreInfoKHR              == nullptr))
+        {
+            detail::errHandler(CL_INVALID_VALUE, __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR);
+        }
+    }
+};
+
+CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Semaphore::ext_init_;
+
+//! \brief Calls clEnqueueWaitSemaphoresKHR() for \p sema_objects; CL_INVALID_OPERATION if it is unavailable.
+inline cl_int CommandQueue::enqueueWaitSemaphores(
+    const vector<Semaphore> &sema_objects,
+    const vector<cl_semaphore_payload_khr> &sema_payloads,
+    const vector<Event>* events_wait_list,
+    Event *event) const
+{
+    cl_event tmp;
+    cl_int err = CL_INVALID_OPERATION;
+    // data() instead of &front(): front() on an empty vector is undefined behaviour.
+    if (pfn_clEnqueueWaitSemaphoresKHR != nullptr) {
+        err = pfn_clEnqueueWaitSemaphoresKHR(
+                object_,
+                (cl_uint)sema_objects.size(),
+                (const cl_semaphore_khr *) sema_objects.data(),
+                (sema_payloads.size() > 0) ? &sema_payloads.front() : nullptr,
+                (events_wait_list != nullptr) ? (cl_uint) events_wait_list->size() : 0,
+                (events_wait_list != nullptr && events_wait_list->size() > 0) ? (cl_event*) &events_wait_list->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr);
+    }
+
+    detail::errHandler(err, __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR);
+
+    if (event != nullptr && err == CL_SUCCESS)
+        *event = tmp;
+    return err;
+}
+
+//! \brief Calls clEnqueueSignalSemaphoresKHR() for \p sema_objects; CL_INVALID_OPERATION if it is unavailable.
+inline cl_int CommandQueue::enqueueSignalSemaphores(
+    const vector<Semaphore> &sema_objects,
+    const vector<cl_semaphore_payload_khr>& sema_payloads,
+    const vector<Event>* events_wait_list,
+    Event* event)
+{
+    cl_event tmp;
+    cl_int err = CL_INVALID_OPERATION;
+    // data() instead of &front(): front() on an empty vector is undefined behaviour.
+    if (pfn_clEnqueueSignalSemaphoresKHR != nullptr) {
+        err = pfn_clEnqueueSignalSemaphoresKHR(
+                object_,
+                (cl_uint)sema_objects.size(),
+                (const cl_semaphore_khr*) sema_objects.data(),
+                (sema_payloads.size() > 0) ? &sema_payloads.front() : nullptr,
+                (events_wait_list != nullptr) ? (cl_uint) events_wait_list->size() : 0,
+                (events_wait_list != nullptr && events_wait_list->size() > 0) ? (cl_event*) &events_wait_list->front() : nullptr,
+                (event != nullptr) ? &tmp : nullptr);
+    }
+
+    detail::errHandler(err, __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR);
+
+    if (event != nullptr && err == CL_SUCCESS)
+        *event = tmp;
+    return err;
+}
+
+#endif // cl_khr_semaphore
+
+#if defined(cl_khr_command_buffer)
+/*! \class CommandBufferKhr
+ * \brief CommandBufferKhr interface for cl_command_buffer_khr.
+ */
+class CommandBufferKhr : public detail::Wrapper<cl_command_buffer_khr>
+{
+public:
+    //! \brief Default constructor - default-constructs the underlying detail::Wrapper; no command buffer is created.
+    CommandBufferKhr() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Creates a command buffer for \p queues with CL_COMMAND_BUFFER_FLAGS_KHR set to \p properties.
+     *  An empty \p queues reports CL_INVALID_VALUE without indexing into the vector.
+     */
+    explicit CommandBufferKhr(const vector<CommandQueue> &queues,
+        cl_command_buffer_properties_khr properties = 0,
+        cl_int* errcode_ret = nullptr)
+    {
+        cl_command_buffer_properties_khr command_buffer_properties[] = {
+            CL_COMMAND_BUFFER_FLAGS_KHR, properties, 0
+        };
+        static_assert(sizeof(cl::CommandQueue) == sizeof(cl_command_queue),
+            "Size of cl::CommandQueue must be equal to size of cl_command_queue");
+        /* initialization of addresses to extension functions (it is done only once) */
+        if (!queues.empty()) std::call_once(ext_init_, [&] { initExtensions(queues[0].getInfo<CL_QUEUE_DEVICE>()); });
+        cl_int error = queues.empty() ? CL_INVALID_VALUE : CL_INVALID_OPERATION;
+        if (!queues.empty() && pfn_clCreateCommandBufferKHR)
+        {
+            object_ = pfn_clCreateCommandBufferKHR((cl_uint) queues.size(),
+                (cl_command_queue *) queues.data(),
+                command_buffer_properties,
+                &error);
+        }
+
+        detail::errHandler(error, __CREATE_COMMAND_BUFFER_KHR_ERR);
+        if (errcode_ret != nullptr) {
+            *errcode_ret = error;
+        }
+    }
+    //! \brief Wraps an existing cl_command_buffer_khr handle; \p retainObject is forwarded to the wrapper.
+    explicit CommandBufferKhr(const cl_command_buffer_khr& commandBufferKhr, bool retainObject = false) :
+        detail::Wrapper<cl_type>(commandBufferKhr, retainObject) { }
+    //! \brief Assigns from a raw cl_command_buffer_khr handle via the wrapper's operator=.
+    CommandBufferKhr& operator=(const cl_command_buffer_khr& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+    //! \brief Wrapper for clGetCommandBufferInfoKHR(); returns CL_INVALID_OPERATION if it is unavailable.
+    template <typename T>
+    cl_int getInfo(cl_command_buffer_info_khr name, T* param) const
+    {
+        if (pfn_clGetCommandBufferInfoKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __GET_COMMAND_BUFFER_INFO_KHR_ERR);
+        }
+        return detail::errHandler(
+            detail::getInfo(pfn_clGetCommandBufferInfoKHR, object_, name, param),
+                __GET_COMMAND_BUFFER_INFO_KHR_ERR);
+    }
+    //! \brief Typed getInfo(): returns the value for \p name and stores the status in \p err if given.
+    template <cl_command_buffer_info_khr name> typename
+        detail::param_traits<detail::cl_command_buffer_info_khr, name>::param_type
+        getInfo(cl_int* err = nullptr) const
+    {
+        typename detail::param_traits<
+            detail::cl_command_buffer_info_khr, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != nullptr) {
+            *err = result;
+        }
+        return param;
+    }
+    //! \brief Finalizes this command buffer. NOTE(review): calls ::clFinalizeCommandBufferKHR directly, unlike the pfn_-checked siblings - confirm the symbol is linkable.
+    cl_int finalizeCommandBuffer() const
+    {
+        return detail::errHandler(::clFinalizeCommandBufferKHR(object_), __FINALIZE_COMMAND_BUFFER_KHR_ERR);
+    }
+
+    //! \brief Calls clEnqueueCommandBufferKHR() for this command buffer; CL_INVALID_OPERATION if it is unavailable.
+    cl_int enqueueCommandBuffer(vector<CommandQueue> &queues,
+        const vector<Event>* events = nullptr,
+        Event* event = nullptr)
+    {
+        if (pfn_clEnqueueCommandBufferKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __ENQUEUE_COMMAND_BUFFER_KHR_ERR);
+        }
+        static_assert(sizeof(cl::CommandQueue) == sizeof(cl_command_queue),
+            "Size of cl::CommandQueue must be equal to size of cl_command_queue");
+        // An empty queue list is passed as (0, nullptr); front() on an empty vector is undefined behaviour.
+        return detail::errHandler(pfn_clEnqueueCommandBufferKHR((cl_uint) queues.size(),
+                queues.empty() ? nullptr : (cl_command_queue *) queues.data(),
+                object_,
+                (events != nullptr) ? (cl_uint) events->size() : 0,
+                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,
+                (cl_event*) event),
+                __ENQUEUE_COMMAND_BUFFER_KHR_ERR);
+    }
+
+    //! \brief Calls clCommandBarrierWithWaitListKHR() for this command buffer; CL_INVALID_OPERATION if it is unavailable.
+    cl_int commandBarrierWithWaitList(const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandBarrierWithWaitListKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR);
+        }
+        // tmp_sync_point receives the new sync point and is copied to *sync_point only on CL_SUCCESS.
+        cl_sync_point_khr tmp_sync_point;
+        cl_int error = detail::errHandler(
+            pfn_clCommandBarrierWithWaitListKHR(object_,
+                (command_queue != nullptr) ? (*command_queue)() : nullptr,
+                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,
+                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,
+                (sync_point != nullptr) ? &tmp_sync_point : nullptr,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR);
+
+        if (sync_point != nullptr && error == CL_SUCCESS)
+            *sync_point = tmp_sync_point;
+        return error;
+    }
+
+    //! \brief Calls clCommandCopyBufferKHR() for this command buffer; CL_INVALID_OPERATION if it is unavailable.
+    cl_int commandCopyBuffer(const Buffer& src,
+        const Buffer& dst,
+        size_type src_offset,
+        size_type dst_offset,
+        size_type size,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandCopyBufferKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __COMMAND_COPY_BUFFER_KHR_ERR);
+        }
+        // tmp_sync_point receives the new sync point and is copied to *sync_point only on CL_SUCCESS.
+        cl_sync_point_khr tmp_sync_point;
+        cl_int error = detail::errHandler(
+            pfn_clCommandCopyBufferKHR(object_,
+                (command_queue != nullptr) ? (*command_queue)() : nullptr,
+                src(),
+                dst(),
+                src_offset,
+                dst_offset,
+                size,
+                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,
+                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,
+                (sync_point != nullptr) ? &tmp_sync_point : nullptr,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_COPY_BUFFER_KHR_ERR);
+
+        if (sync_point != nullptr && error == CL_SUCCESS)
+            *sync_point = tmp_sync_point;
+        return error;
+    }
+
+    //! \brief Calls clCommandCopyBufferRectKHR() for this command buffer; CL_INVALID_OPERATION if it is unavailable.
+    cl_int commandCopyBufferRect(const Buffer& src,
+        const Buffer& dst,
+        const array<size_type, 3>& src_origin,
+        const array<size_type, 3>& dst_origin,
+        const array<size_type, 3>& region,
+        size_type src_row_pitch,
+        size_type src_slice_pitch,
+        size_type dst_row_pitch,
+        size_type dst_slice_pitch,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandCopyBufferRectKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __COMMAND_COPY_BUFFER_RECT_KHR_ERR);
+        }
+        // tmp_sync_point receives the new sync point and is copied to *sync_point only on CL_SUCCESS.
+        cl_sync_point_khr tmp_sync_point;
+        cl_int error = detail::errHandler(
+            pfn_clCommandCopyBufferRectKHR(object_,
+                (command_queue != nullptr) ? (*command_queue)() : nullptr,
+                src(),
+                dst(),
+                src_origin.data(),
+                dst_origin.data(),
+                region.data(),
+                src_row_pitch,
+                src_slice_pitch,
+                dst_row_pitch,
+                dst_slice_pitch,
+                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,
+                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,
+                (sync_point != nullptr) ? &tmp_sync_point : nullptr,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_COPY_BUFFER_RECT_KHR_ERR);
+
+        if (sync_point != nullptr && error == CL_SUCCESS)
+            *sync_point = tmp_sync_point;
+        return error;
+    }
+
+    //! \brief Calls clCommandCopyBufferToImageKHR() for this command buffer; CL_INVALID_OPERATION if it is unavailable.
+    cl_int commandCopyBufferToImage(const Buffer& src,
+        const Image& dst,
+        size_type src_offset,
+        const array<size_type, 3>& dst_origin,
+        const array<size_type, 3>& region,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandCopyBufferToImageKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR);
+        }
+        // tmp_sync_point receives the new sync point and is copied to *sync_point only on CL_SUCCESS.
+        cl_sync_point_khr tmp_sync_point;
+        cl_int error = detail::errHandler(
+            pfn_clCommandCopyBufferToImageKHR(object_,
+                (command_queue != nullptr) ? (*command_queue)() : nullptr,
+                src(),
+                dst(),
+                src_offset,
+                dst_origin.data(),
+                region.data(),
+                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,
+                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,
+                (sync_point != nullptr) ? &tmp_sync_point : nullptr,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR);
+
+        if (sync_point != nullptr && error == CL_SUCCESS)
+            *sync_point = tmp_sync_point;
+        return error;
+    }
+
+    //! \brief Calls clCommandCopyImageKHR() for this command buffer; CL_INVALID_OPERATION if it is unavailable.
+    cl_int commandCopyImage(const Image& src,
+        const Image& dst,
+        const array<size_type, 3>& src_origin,
+        const array<size_type, 3>& dst_origin,
+        const array<size_type, 3>& region,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandCopyImageKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __COMMAND_COPY_IMAGE_KHR_ERR);
+        }
+        // tmp_sync_point receives the new sync point and is copied to *sync_point only on CL_SUCCESS.
+        cl_sync_point_khr tmp_sync_point;
+        cl_int error = detail::errHandler(
+            pfn_clCommandCopyImageKHR(object_,
+                (command_queue != nullptr) ? (*command_queue)() : nullptr,
+                src(),
+                dst(),
+                src_origin.data(),
+                dst_origin.data(),
+                region.data(),
+                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,
+                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,
+                (sync_point != nullptr) ? &tmp_sync_point : nullptr,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_COPY_IMAGE_KHR_ERR);
+
+        if (sync_point != nullptr && error == CL_SUCCESS)
+            *sync_point = tmp_sync_point;
+        return error;
+    }
+
+    //! \brief Calls clCommandCopyImageToBufferKHR() for this command buffer; CL_INVALID_OPERATION if it is unavailable.
+    cl_int commandCopyImageToBuffer(const Image& src,
+        const Buffer& dst,
+        const array<size_type, 3>& src_origin,
+        const array<size_type, 3>& region,
+        size_type dst_offset,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandCopyImageToBufferKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR);
+        }
+        // tmp_sync_point receives the new sync point and is copied to *sync_point only on CL_SUCCESS.
+        cl_sync_point_khr tmp_sync_point;
+        cl_int error = detail::errHandler(
+            pfn_clCommandCopyImageToBufferKHR(object_,
+                (command_queue != nullptr) ? (*command_queue)() : nullptr,
+                src(),
+                dst(),
+                src_origin.data(),
+                region.data(),
+                dst_offset,
+                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,
+                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,
+                (sync_point != nullptr) ? &tmp_sync_point : nullptr,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR);
+
+        if (sync_point != nullptr && error == CL_SUCCESS)
+            *sync_point = tmp_sync_point;
+        return error;
+    }
+
+    //! \brief Forwards a buffer fill to clCommandFillBufferKHR using sizeof(PatternType) bytes of pattern.
+    template<typename PatternType>
+    cl_int commandFillBuffer(const Buffer& buffer,
+        PatternType pattern,
+        size_type offset,
+        size_type size,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandFillBufferKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION, __COMMAND_FILL_BUFFER_KHR_ERR);
+        }
+        const bool has_deps = (sync_points_vec != nullptr) && !sync_points_vec->empty();
+        cl_sync_point_khr recorded_point;
+        const cl_int status = detail::errHandler(
+            pfn_clCommandFillBufferKHR(object_,
+                (command_queue == nullptr) ? nullptr : (*command_queue)(),
+                buffer(),
+                static_cast<void*>(&pattern),
+                sizeof(PatternType),
+                offset,
+                size,
+                has_deps ? (cl_uint) sync_points_vec->size() : 0,
+                has_deps ? &sync_points_vec->front() : nullptr,
+                (sync_point == nullptr) ? nullptr : &recorded_point,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_FILL_BUFFER_KHR_ERR);
+        // Only publish the sync point when the call reported success.
+        if (status == CL_SUCCESS && sync_point != nullptr) {
+            *sync_point = recorded_point;
+        }
+        return status;
+    }
+
+    //! \brief Forwards an image fill with a cl_float4 color to clCommandFillImageKHR for this command buffer.
+    cl_int commandFillImage(const Image& image,
+        cl_float4 fillColor,
+        const array<size_type, 3>& origin,
+        const array<size_type, 3>& region,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandFillImageKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION, __COMMAND_FILL_IMAGE_KHR_ERR);
+        }
+        const bool has_deps = (sync_points_vec != nullptr) && !sync_points_vec->empty();
+        cl_sync_point_khr recorded_point;
+        const cl_int status = detail::errHandler(
+            pfn_clCommandFillImageKHR(object_,
+                (command_queue == nullptr) ? nullptr : (*command_queue)(),
+                image(),
+                static_cast<void*>(&fillColor),
+                origin.data(),
+                region.data(),
+                has_deps ? (cl_uint) sync_points_vec->size() : 0,
+                has_deps ? &sync_points_vec->front() : nullptr,
+                (sync_point == nullptr) ? nullptr : &recorded_point,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_FILL_IMAGE_KHR_ERR);
+        // Only publish the sync point when the call reported success.
+        if (status == CL_SUCCESS && sync_point != nullptr) {
+            *sync_point = recorded_point;
+        }
+        return status;
+    }
+
+    //! \brief Forwards an ND-range kernel command to clCommandNDRangeKernelKHR; an empty properties list is passed as nullptr.
+    cl_int commandNDRangeKernel(const cl::vector<cl_ndrange_kernel_command_properties_khr> &properties,
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local = NullRange,
+        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,
+        cl_sync_point_khr* sync_point = nullptr,
+        MutableCommandKhr* mutable_handle = nullptr,
+        const CommandQueue* command_queue = nullptr)
+    {
+        if (pfn_clCommandNDRangeKernelKHR == nullptr) {
+            return detail::errHandler(CL_INVALID_OPERATION,
+                    __COMMAND_NDRANGE_KERNEL_KHR_ERR);
+        }
+        cl_sync_point_khr tmp_sync_point;  // scratch slot; copied out only on success
+        cl_int error = detail::errHandler(
+            pfn_clCommandNDRangeKernelKHR(object_,
+                (command_queue != nullptr) ? (*command_queue)() : nullptr,
+                // Indexing element 0 of an empty vector is undefined; hand the driver nullptr instead.
+                properties.empty() ? nullptr : properties.data(),
+                kernel(),
+                (cl_uint) global.dimensions(),  // work dimension is taken from `global`
+                offset.dimensions() != 0 ? (const size_type*) offset : nullptr,
+                (const size_type*) global,
+                local.dimensions() != 0 ? (const size_type*) local : nullptr,
+                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,
+                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,
+                (sync_point != nullptr) ? &tmp_sync_point : nullptr,
+                (cl_mutable_command_khr*) mutable_handle),
+            __COMMAND_NDRANGE_KERNEL_KHR_ERR);
+        if (sync_point != nullptr && error == CL_SUCCESS)
+            *sync_point = tmp_sync_point;
+
+        return error;
+    }
+
+#if defined(cl_khr_command_buffer_mutable_dispatch)
+    //! \brief Forwards mutable_config to clUpdateMutableCommandsKHR for this command buffer.
+    cl_int updateMutableCommands(const cl_mutable_base_config_khr* mutable_config)
+    {
+        // A missing entry point goes through the same error handler as a failed call.
+        const cl_int status = (pfn_clUpdateMutableCommandsKHR == nullptr)
+            ? CL_INVALID_OPERATION
+            : pfn_clUpdateMutableCommandsKHR(object_, mutable_config);
+        return detail::errHandler(status, __UPDATE_MUTABLE_COMMANDS_KHR_ERR);
+    }
+#endif /* cl_khr_command_buffer_mutable_dispatch */
+
+private:
+    static std::once_flag ext_init_;
+    //! \brief Runs the CL_HPP_INIT_CL_EXT_FCN_PTR* macros for every command-buffer entry point, then reports if all pfn_* pointers are null.
+    static void initExtensions(const cl::Device& device)
+    {
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();  // passed to each init macro below
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clFinalizeCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clRetainCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clReleaseCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetCommandBufferInfoKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandBarrierWithWaitListKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferRectKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferToImageKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyImageKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyImageToBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandFillBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandFillImageKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandNDRangeKernelKHR);
+#if defined(cl_khr_command_buffer_mutable_dispatch)
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clUpdateMutableCommandsKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetMutableCommandInfoKHR);
+#endif /* cl_khr_command_buffer_mutable_dispatch */
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 110  // no platform argument on this path; `device` is not used here
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clFinalizeCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clRetainCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clReleaseCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetCommandBufferInfoKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueCommandBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandBarrierWithWaitListKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferRectKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferToImageKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyImageKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyImageToBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandFillBufferKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandFillImageKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandNDRangeKernelKHR);
+#if defined(cl_khr_command_buffer_mutable_dispatch)
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clUpdateMutableCommandsKHR);
+        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetMutableCommandInfoKHR);
+#endif /* cl_khr_command_buffer_mutable_dispatch */
+#endif /* CL_HPP_TARGET_OPENCL_VERSION */
+        if ((pfn_clCreateCommandBufferKHR        == nullptr) &&  // condition holds only when every pointer is null
+            (pfn_clFinalizeCommandBufferKHR      == nullptr) &&
+            (pfn_clRetainCommandBufferKHR        == nullptr) &&
+            (pfn_clReleaseCommandBufferKHR       == nullptr) &&
+            (pfn_clGetCommandBufferInfoKHR       == nullptr) &&
+            (pfn_clEnqueueCommandBufferKHR       == nullptr) &&
+            (pfn_clCommandBarrierWithWaitListKHR == nullptr) &&
+            (pfn_clCommandCopyBufferKHR          == nullptr) &&
+            (pfn_clCommandCopyBufferRectKHR      == nullptr) &&
+            (pfn_clCommandCopyBufferToImageKHR   == nullptr) &&
+            (pfn_clCommandCopyImageKHR           == nullptr) &&
+            (pfn_clCommandCopyImageToBufferKHR   == nullptr) &&
+            (pfn_clCommandFillBufferKHR          == nullptr) &&
+            (pfn_clCommandFillImageKHR           == nullptr) &&
+            (pfn_clCommandNDRangeKernelKHR       == nullptr)
+#if defined(cl_khr_command_buffer_mutable_dispatch)
+            && (pfn_clUpdateMutableCommandsKHR      == nullptr)
+            && (pfn_clGetMutableCommandInfoKHR      == nullptr)
+#endif /* cl_khr_command_buffer_mutable_dispatch */
+            )
+        {
+            detail::errHandler(CL_INVALID_VALUE, __CREATE_COMMAND_BUFFER_KHR_ERR);  // a partially missing set is not reported here
+        }
+    }
+}; // CommandBufferKhr
+
+CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandBufferKhr::ext_init_;
+
+#if defined(cl_khr_command_buffer_mutable_dispatch)
+/*! \class MutableCommandKhr
+ * \brief Wrapper class around a cl_mutable_command_khr handle.
+ */
+class MutableCommandKhr : public detail::Wrapper<cl_mutable_command_khr>
+{
+public:
+    //! \brief Default constructor; the wrapped handle starts out as nullptr.
+    MutableCommandKhr() : detail::Wrapper<cl_type>() {}
+
+    //! \brief Wraps an existing handle; retainObject is forwarded to the base Wrapper.
+    explicit MutableCommandKhr(const cl_mutable_command_khr& mutableCommandKhr, bool retainObject = false)
+        : detail::Wrapper<cl_type>(mutableCommandKhr, retainObject) {}
+
+    //! \brief Assigns a raw handle through the base Wrapper assignment operator.
+    MutableCommandKhr& operator=(const cl_mutable_command_khr& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Queries `name` into *param; yields CL_INVALID_OPERATION when the entry point pointer is null.
+    template <typename T>
+    cl_int getInfo(cl_mutable_command_info_khr name, T* param) const
+    {
+        const cl_int status = (pfn_clGetMutableCommandInfoKHR == nullptr)
+            ? CL_INVALID_OPERATION
+            : detail::getInfo(pfn_clGetMutableCommandInfoKHR, object_, name, param);
+        return detail::errHandler(status, __GET_MUTABLE_COMMAND_INFO_KHR_ERR);
+    }
+
+    //! \brief Typed query: returns the value for `name` and stores the status in *err when err is non-null.
+    template <cl_mutable_command_info_khr name> typename
+        detail::param_traits<detail::cl_mutable_command_info_khr, name>::param_type
+        getInfo(cl_int* err = nullptr) const
+    {
+        using info_type = typename detail::param_traits<
+            detail::cl_mutable_command_info_khr, name>::param_type;
+        info_type value;
+        const cl_int status = getInfo(name, &value);
+        if (err != nullptr) { *err = status; }
+        return value;
+    }
+}; // MutableCommandKhr
+#endif /* cl_khr_command_buffer_mutable_dispatch */
+
+#endif // cl_khr_command_buffer
 //----------------------------------------------------------------------------------------------------------------------
 
 #undef CL_HPP_ERR_STR_
 #if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS)
-#undef __GET_DEVICE_INFO_ERR
-#undef __GET_PLATFORM_INFO_ERR
-#undef __GET_DEVICE_IDS_ERR
-#undef __GET_CONTEXT_INFO_ERR
-#undef __GET_EVENT_INFO_ERR
-#undef __GET_EVENT_PROFILE_INFO_ERR
-#undef __GET_MEM_OBJECT_INFO_ERR
-#undef __GET_IMAGE_INFO_ERR
-#undef __GET_SAMPLER_INFO_ERR
-#undef __GET_KERNEL_INFO_ERR
-#undef __GET_KERNEL_ARG_INFO_ERR
-#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
-#undef __GET_PROGRAM_INFO_ERR
-#undef __GET_PROGRAM_BUILD_INFO_ERR
-#undef __GET_COMMAND_QUEUE_INFO_ERR
-
-#undef __CREATE_CONTEXT_ERR
+#undef __GET_DEVICE_INFO_ERR               
+#undef __GET_PLATFORM_INFO_ERR             
+#undef __GET_DEVICE_IDS_ERR                
+#undef __GET_PLATFORM_IDS_ERR              
+#undef __GET_CONTEXT_INFO_ERR              
+#undef __GET_EVENT_INFO_ERR                
+#undef __GET_EVENT_PROFILE_INFO_ERR        
+#undef __GET_MEM_OBJECT_INFO_ERR           
+#undef __GET_IMAGE_INFO_ERR                
+#undef __GET_SAMPLER_INFO_ERR              
+#undef __GET_KERNEL_INFO_ERR               
+#undef __GET_KERNEL_ARG_INFO_ERR           
+#undef __GET_KERNEL_SUB_GROUP_INFO_ERR     
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR    
+#undef __GET_PROGRAM_INFO_ERR              
+#undef __GET_PROGRAM_BUILD_INFO_ERR        
+#undef __GET_COMMAND_QUEUE_INFO_ERR        
+#undef __CREATE_CONTEXT_ERR                
 #undef __CREATE_CONTEXT_FROM_TYPE_ERR
-#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
-
-#undef __CREATE_BUFFER_ERR
-#undef __CREATE_SUBBUFFER_ERR
-#undef __CREATE_IMAGE2D_ERR
-#undef __CREATE_IMAGE3D_ERR
-#undef __CREATE_SAMPLER_ERR
-#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
-
-#undef __CREATE_USER_EVENT_ERR
-#undef __SET_USER_EVENT_STATUS_ERR
-#undef __SET_EVENT_CALLBACK_ERR
-#undef __SET_PRINTF_CALLBACK_ERR
-
-#undef __WAIT_FOR_EVENTS_ERR
-
-#undef __CREATE_KERNEL_ERR
-#undef __SET_KERNEL_ARGS_ERR
-#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
-#undef __CREATE_PROGRAM_WITH_BINARY_ERR
-#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
-#undef __BUILD_PROGRAM_ERR
-#undef __CREATE_KERNELS_IN_PROGRAM_ERR
-
-#undef __CREATE_COMMAND_QUEUE_ERR
-#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
-#undef __ENQUEUE_READ_BUFFER_ERR
-#undef __ENQUEUE_WRITE_BUFFER_ERR
-#undef __ENQUEUE_READ_BUFFER_RECT_ERR
-#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
-#undef __ENQEUE_COPY_BUFFER_ERR
-#undef __ENQEUE_COPY_BUFFER_RECT_ERR
-#undef __ENQUEUE_READ_IMAGE_ERR
-#undef __ENQUEUE_WRITE_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_ERR
-#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
-#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __CREATE_COMMAND_BUFFER_KHR_ERR
+#undef __GET_COMMAND_BUFFER_INFO_KHR_ERR
+#undef __FINALIZE_COMMAND_BUFFER_KHR_ERR
+#undef __ENQUEUE_COMMAND_BUFFER_KHR_ERR
+#undef __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR
+#undef __COMMAND_COPY_BUFFER_KHR_ERR
+#undef __COMMAND_COPY_BUFFER_RECT_KHR_ERR
+#undef __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR
+#undef __COMMAND_COPY_IMAGE_KHR_ERR
+#undef __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR
+#undef __COMMAND_FILL_BUFFER_KHR_ERR
+#undef __COMMAND_FILL_IMAGE_KHR_ERR
+#undef __COMMAND_NDRANGE_KERNEL_KHR_ERR
+#undef __UPDATE_MUTABLE_COMMANDS_KHR_ERR
+#undef __GET_MUTABLE_COMMAND_INFO_KHR_ERR
+#undef __RETAIN_COMMAND_BUFFER_KHR_ERR
+#undef __RELEASE_COMMAND_BUFFER_KHR_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR   
+#undef __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR
+#undef __CREATE_BUFFER_ERR                 
+#undef __COPY_ERR                          
+#undef __CREATE_SUBBUFFER_ERR              
+#undef __CREATE_GL_BUFFER_ERR              
+#undef __CREATE_GL_RENDER_BUFFER_ERR       
+#undef __GET_GL_OBJECT_INFO_ERR            
+#undef __CREATE_IMAGE_ERR                  
+#undef __CREATE_GL_TEXTURE_ERR             
+#undef __IMAGE_DIMENSION_ERR               
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR 
+#undef __CREATE_USER_EVENT_ERR             
+#undef __SET_USER_EVENT_STATUS_ERR         
+#undef __SET_EVENT_CALLBACK_ERR            
+#undef __WAIT_FOR_EVENTS_ERR               
+#undef __CREATE_KERNEL_ERR                 
+#undef __SET_KERNEL_ARGS_ERR               
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR    
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR    
+#undef __CREATE_PROGRAM_WITH_IL_ERR        
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    
+#undef __BUILD_PROGRAM_ERR                 
+#undef __COMPILE_PROGRAM_ERR               
+#undef __LINK_PROGRAM_ERR                  
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR     
+#undef __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR          
+#undef __CREATE_SAMPLER_WITH_PROPERTIES_ERR                
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR    
+#undef __ENQUEUE_READ_BUFFER_ERR           
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR      
+#undef __ENQUEUE_WRITE_BUFFER_ERR          
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR     
+#undef __ENQEUE_COPY_BUFFER_ERR            
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR       
+#undef __ENQUEUE_FILL_BUFFER_ERR           
+#undef __ENQUEUE_READ_IMAGE_ERR            
+#undef __ENQUEUE_WRITE_IMAGE_ERR           
+#undef __ENQUEUE_COPY_IMAGE_ERR            
+#undef __ENQUEUE_FILL_IMAGE_ERR            
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  
 #undef __ENQUEUE_MAP_BUFFER_ERR
 #undef __ENQUEUE_MAP_IMAGE_ERR
-#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
-#undef __ENQUEUE_NDRANGE_KERNEL_ERR
-#undef __ENQUEUE_TASK_ERR
-#undef __ENQUEUE_NATIVE_KERNEL
-
-#undef __UNLOAD_COMPILER_ERR
+#undef __ENQUEUE_MAP_SVM_ERR
+#undef __ENQUEUE_FILL_SVM_ERR
+#undef __ENQUEUE_COPY_SVM_ERR
+#undef __ENQUEUE_UNMAP_SVM_ERR              
+#undef __ENQUEUE_MAP_IMAGE_ERR             
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR      
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR        
+#undef __ENQUEUE_NATIVE_KERNEL             
+#undef __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   
+#undef __ENQUEUE_MIGRATE_SVM_ERR
+#undef __ENQUEUE_ACQUIRE_GL_ERR            
+#undef __ENQUEUE_RELEASE_GL_ERR            
+#undef __CREATE_PIPE_ERR             
+#undef __GET_PIPE_INFO_ERR           
+#undef __RETAIN_ERR                        
+#undef __RELEASE_ERR                       
+#undef __FLUSH_ERR                         
+#undef __FINISH_ERR                        
+#undef __VECTOR_CAPACITY_ERR               
 #undef __CREATE_SUB_DEVICES_ERR
-
-#undef __CREATE_PIPE_ERR
-#undef __GET_PIPE_INFO_ERR
+#undef __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR
+#undef __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR
+#undef __ENQUEUE_MARKER_ERR                
+#undef __ENQUEUE_WAIT_FOR_EVENTS_ERR       
+#undef __ENQUEUE_BARRIER_ERR               
+#undef __UNLOAD_COMPILER_ERR               
+#undef __CREATE_GL_TEXTURE_2D_ERR          
+#undef __CREATE_GL_TEXTURE_3D_ERR          
+#undef __CREATE_IMAGE2D_ERR                
+#undef __CREATE_IMAGE3D_ERR                
+#undef __CREATE_COMMAND_QUEUE_ERR          
+#undef __ENQUEUE_TASK_ERR                  
+#undef __CREATE_SAMPLER_ERR                
+#undef __ENQUEUE_MARKER_WAIT_LIST_ERR                
+#undef __ENQUEUE_BARRIER_WAIT_LIST_ERR               
+#undef __CLONE_KERNEL_ERR     
+#undef __GET_HOST_TIMER_ERR
+#undef __GET_DEVICE_AND_HOST_TIMER_ERR
+#undef __GET_SEMAPHORE_KHR_INFO_ERR
+#undef __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR
+#undef __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR
+#undef __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR
+#undef __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR
+#undef __RETAIN_SEMAPHORE_KHR_ERR
+#undef __RELEASE_SEMAPHORE_KHR_ERR
+#undef __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR
 
 #endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS
 
 // Extensions
+#undef CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_
 #undef CL_HPP_INIT_CL_EXT_FCN_PTR_
 #undef CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_
 
-#if defined(CL_HPP_USE_CL_DEVICE_FISSION)
-#undef CL_HPP_PARAM_NAME_DEVICE_FISSION_
-#endif // CL_HPP_USE_CL_DEVICE_FISSION
-
-#undef CL_HPP_NOEXCEPT_
 #undef CL_HPP_DEFINE_STATIC_MEMBER_
 
 } // namespace cl
diff --git a/third_party/pjrt_c_api.h b/third_party/pjrt_c_api.h
index f1ab16f1a6..3158073aea 100644
--- a/third_party/pjrt_c_api.h
+++ b/third_party/pjrt_c_api.h
@@ -16,21 +16,71 @@ limitations under the License.
 #ifndef XLA_PJRT_C_PJRT_C_API_H_
 #define XLA_PJRT_C_PJRT_C_API_H_
 
+#include <assert.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 
+// Read more on C API ABI versioning and compatibility here:
+// https://docs.google.com/document/d/1TKB5NyGtdzrpgw5mpyFjVAhJjpSNdF31T6pjPl_UT2o/edit?usp=sharing
+
 #define PJRT_STRUCT_SIZE(struct_type, last_field) \
   offsetof(struct_type, last_field) + sizeof(((struct_type*)0)->last_field)
 
-#define PJRT_DEFINE_STRUCT_TRAITS(sname, last_field) \
-  typedef struct sname sname;                        \
-  enum { sname##_STRUCT_SIZE = PJRT_STRUCT_SIZE(sname, last_field) }
+#ifdef __cplusplus  // C++ only: asserts sizeof(sname) equals the end of last_field rounded up to alignof(sname).
+#define PJRT_CHECK_STRUCT_SIZE(sname, last_field)                       \
+  static_assert(                                                        \
+      sizeof(struct sname) ==                                           \
+          ((PJRT_STRUCT_SIZE(sname, last_field) + alignof(sname) - 1) / \
+           alignof(sname)) *                                            \
+              alignof(sname),                                           \
+      "Failed to update last_field");
+#else  // Plain C: the macro expands to nothing, so no check is performed.
+#define PJRT_CHECK_STRUCT_SIZE(sname, last_field)
+#endif  // __cplusplus
+
+// Must update PJRT_DEFINE_STRUCT_TRAITS with the new `last_field` after
+// adding a new member to a struct.
+#define PJRT_DEFINE_STRUCT_TRAITS(sname, last_field)                  \
+  typedef struct sname sname;                                         \
+  enum { sname##_STRUCT_SIZE = PJRT_STRUCT_SIZE(sname, last_field) }; \
+  PJRT_CHECK_STRUCT_SIZE(sname, last_field)
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+// ------------------------------- Extensions ----------------------------------
+
+typedef enum {  // Type tag stored in PJRT_Extension_Base::type; only the first value is explicit, the rest count up from it.
+  PJRT_Extension_Type_Gpu_Custom_Call = 0,
+  PJRT_Extension_Type_Profiler,
+  PJRT_Extension_Type_Custom_Partitioner,
+  PJRT_Extension_Type_Stream,
+  PJRT_Extension_Type_Layouts,
+  PJRT_Extension_Type_FFI,
+  PJRT_Extension_Type_MemoryDescriptions,
+  PJRT_Extension_Type_Triton,
+  PJRT_Extension_Type_RawBuffer,     // Experimental.
+  PJRT_Extension_Type_PhaseCompile,  // Experimental.
+  PJRT_Extension_Type_Example,
+  PJRT_Extension_Type_Unknown,
+  PJRT_Extension_Type_CrossHostTransfers,
+  PJRT_Extension_Type_ExecutableMetadata,
+  PJRT_Extension_Type_Callback,
+  PJRT_Extension_Type_HostAllocator,  // Experimental.
+} PJRT_Extension_Type;  // NOTE(review): values are positional, so presumably new entries must only be appended - confirm.
+
+// PJRT_Extension_Base contains a type and a pointer to next
+// PJRT_Extension_Base. The framework can go through this chain to find an
+// extension and identify it with the type.
+typedef struct PJRT_Extension_Base {
+  size_t struct_size;
+  PJRT_Extension_Type type;
+  struct PJRT_Extension_Base* next;
+} PJRT_Extension_Base;
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Extension_Base, next);
+
 // --------------------------------- Version -----------------------------------
 
 // Incremented when an ABI-incompatible change is made to the interface.
@@ -53,14 +103,14 @@ extern "C" {
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 40
+#define PJRT_API_MINOR 80
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
 // this header that the implementation was compiled with.
 struct PJRT_Api_Version {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   int major_version;  // out
   int minor_version;  // out
 };
@@ -77,7 +127,7 @@ typedef struct PJRT_Error PJRT_Error;
 
 struct PJRT_Error_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Error* error;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Error_Destroy_Args, error);
@@ -87,7 +137,7 @@ typedef void PJRT_Error_Destroy(PJRT_Error_Destroy_Args* args);
 
 struct PJRT_Error_Message_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const PJRT_Error* error;
   // Has the lifetime of `error`.
   const char* message;  // out
@@ -101,6 +151,7 @@ typedef void PJRT_Error_Message(PJRT_Error_Message_Args* args);
 
 // Codes are based on https://abseil.io/docs/cpp/guides/status-codes
 typedef enum {
+  PJRT_Error_Code_OK = 0,
   PJRT_Error_Code_CANCELLED = 1,
   PJRT_Error_Code_UNKNOWN = 2,
   PJRT_Error_Code_INVALID_ARGUMENT = 3,
@@ -121,7 +172,7 @@ typedef enum {
 
 struct PJRT_Error_GetCode_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const PJRT_Error* error;
   PJRT_Error_Code code;  // out
 };
@@ -151,7 +202,7 @@ typedef enum {
 // Named value for key-value pairs.
 struct PJRT_NamedValue {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const char* name;
   size_t name_size;
   PJRT_NamedValue_Type type;
@@ -172,25 +223,25 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_NamedValue, value_size);
 
 struct PJRT_Plugin_Initialize_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
 };
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_Plugin_Initialize_Args, priv);
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Plugin_Initialize_Args, extension_start);
 
 // One-time plugin setup. Must be called before any other functions are called.
 typedef PJRT_Error* PJRT_Plugin_Initialize(PJRT_Plugin_Initialize_Args* args);
 
 struct PJRT_Plugin_Attributes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   // Returned attributes have the lifetime of the process.
   const PJRT_NamedValue* attributes;  // out
   size_t num_attributes;              // out
 };
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_Plugin_Attributes_Args, attributes);
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Plugin_Attributes_Args, num_attributes);
 
-// Returns an array of plugin attributes which are key-value pairs. One example
-// attribute is the minimum supported StableHLO version.
-// TODO(b/280349977): standardize the list of attributes.
+// Returns an array of plugin attributes which are key-value pairs. Common keys
+// include `xla_version`, `stablehlo_current_version`, and
+// `stablehlo_minimum_version`.
 typedef PJRT_Error* PJRT_Plugin_Attributes(PJRT_Plugin_Attributes_Args* args);
 
 // ---------------------------------- Events -----------------------------------
@@ -205,7 +256,7 @@ typedef struct PJRT_Event PJRT_Event;
 
 struct PJRT_Event_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Event* event;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Event_Destroy_Args, event);
@@ -215,7 +266,7 @@ typedef PJRT_Error* PJRT_Event_Destroy(PJRT_Event_Destroy_Args* args);
 
 struct PJRT_Event_IsReady_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Event* event;
   bool is_ready;  // out
 };
@@ -227,7 +278,7 @@ typedef PJRT_Error* PJRT_Event_IsReady(PJRT_Event_IsReady_Args* args);
 
 struct PJRT_Event_Error_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Event* event;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Event_Error_Args, event);
@@ -245,7 +296,7 @@ typedef PJRT_Error* PJRT_Event_Error(PJRT_Event_Error_Args* args);
 
 struct PJRT_Event_Await_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Event* event;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Event_Await_Args, event);
@@ -263,7 +314,7 @@ typedef void (*PJRT_Event_OnReadyCallback)(PJRT_Error* error, void* user_arg);
 
 struct PJRT_Event_OnReady_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Event* event;
   PJRT_Event_OnReadyCallback callback;
   // `user_arg` allows `callback` to be called with arbitrary arguments (e.g.
@@ -281,23 +332,28 @@ typedef PJRT_Error* PJRT_Event_OnReady(PJRT_Event_OnReady_Args* args);
 typedef struct PJRT_Client PJRT_Client;
 typedef struct PJRT_Device PJRT_Device;
 typedef struct PJRT_Memory PJRT_Memory;
+typedef struct PJRT_ShapeSpec PJRT_ShapeSpec;
 typedef struct PJRT_DeviceDescription PJRT_DeviceDescription;
 typedef struct PJRT_TopologyDescription PJRT_TopologyDescription;
 typedef struct PJRT_Executable PJRT_Executable;
 typedef struct PJRT_LoadedExecutable PJRT_LoadedExecutable;
 typedef struct PJRT_Buffer PJRT_Buffer;
+typedef struct PJRT_FulfillAliasBufferCallback PJRT_FulfillAliasBufferCallback;
+typedef struct PJRT_AsyncHostToDeviceTransferManager
+    PJRT_AsyncHostToDeviceTransferManager;
+typedef struct PJRT_PhaseCompiler PJRT_PhaseCompiler;
 
 // The caller of PJRT_Client_Create can optionally provide a key-value store
-// accessible across nodes and/or processes. KV store access may be necessary to
-// create some multi-node/multi-process clients. The caller can provide the two
-// callbacks below to access the key-value store.
+// accessible across nodes and/or processes. KV store access may be necessary
+// to create some multi-node/multi-process clients. The caller can provide the
+// two callbacks below to access the key-value store.
 
 // A callback to delete the value returned by PJRT_KeyValueGetCallback.
 typedef void (*PJRT_KeyValueGetCallback_ValueDeleter)(char* value);
 
 struct PJRT_KeyValueGetCallback_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const char* key;
   size_t key_size;
   int timeout_in_ms;
@@ -321,9 +377,38 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_KeyValueGetCallback_Args,
 typedef PJRT_Error* (*PJRT_KeyValueGetCallback)(
     PJRT_KeyValueGetCallback_Args* args);
 
+// Same as KeyValueGet, but returns `NotFoundError` immediately if the key is
+// not found.
+typedef void (*PJRT_KeyValueTryGetCallback_ValueDeleter)(char* value);
+
+struct PJRT_KeyValueTryGetCallback_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  const char* key;
+  size_t key_size;  // presumably the length of `key` in bytes - confirm against the caller
+  PJRT_CallbackError* callback_error;
+  void* user_arg;  // receives PJRT_Client_Create_Args::kv_try_get_user_arg
+  char* value;        // out
+  size_t value_size;  // out
+  // The caller needs to set a PJRT_KeyValueTryGetCallback_ValueDeleter to
+  // delete the value returned by PJRT_KeyValueTryGetCallback. The
+  // implementation is responsible for copying `value` and then calling
+  // value_deleter_callback.
+  PJRT_KeyValueTryGetCallback_ValueDeleter value_deleter_callback;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_KeyValueTryGetCallback_Args,
+                          value_deleter_callback);
+
+// Requirements for PJRT_KeyValueTryGetCallback implementation: (1) Thread-safe.
+// (2) The caller that provides the two callbacks is responsible for avoiding
+// key collisions between different users of key-value store (i.e. between
+// different plugins, but not between different nodes in one plugin).
+typedef PJRT_Error* (*PJRT_KeyValueTryGetCallback)(
+    PJRT_KeyValueTryGetCallback_Args* args);
+
 struct PJRT_KeyValuePutCallback_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const char* key;
   size_t key_size;
   // Only needs to stay alive for the duration of the PJRT_KeyValuePutCallback
@@ -344,7 +429,7 @@ typedef PJRT_Error* (*PJRT_KeyValuePutCallback)(
 
 struct PJRT_Client_Create_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   // Extra platform-specific options to create a client.
   const PJRT_NamedValue* create_options;
   size_t num_options;
@@ -359,15 +444,22 @@ struct PJRT_Client_Create_Args {
   void* kv_put_user_arg;
 
   PJRT_Client* client;  // out
+
+  // Key-value try-get callback provided by the caller of PJRT_Client_Create.
+  // Same as key-value get callback, but returns `NotFoundError` immediately if
+  // the key is not found.
+  PJRT_KeyValueTryGetCallback kv_try_get_callback;
+  // Will be passed to `kv_try_get_callback` as `user_arg` argument.
+  void* kv_try_get_user_arg;
 };
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_Create_Args, client);
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_Create_Args, kv_try_get_user_arg);
 
 // Creates and initializes a new PJRT_Client and returns in `client`.
 typedef PJRT_Error* PJRT_Client_Create(PJRT_Client_Create_Args* args);
 
 struct PJRT_Client_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_Destroy_Args, client);
@@ -377,7 +469,7 @@ typedef PJRT_Error* PJRT_Client_Destroy(PJRT_Client_Destroy_Args* args);
 
 struct PJRT_Client_PlatformName_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   // `platform_name` has the same lifetime as `client`. It is owned by `client`.
   const char* platform_name;  // out
@@ -391,7 +483,7 @@ typedef PJRT_Error* PJRT_Client_PlatformName(
 
 struct PJRT_Client_ProcessIndex_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   int process_index;  // out
 };
@@ -404,7 +496,7 @@ typedef PJRT_Error* PJRT_Client_ProcessIndex(
 
 struct PJRT_Client_PlatformVersion_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   // `platform_version` has the same lifetime as `client`. It's owned by
   // `client`.
@@ -421,7 +513,7 @@ typedef PJRT_Error* PJRT_Client_PlatformVersion(
 
 struct PJRT_Client_TopologyDescription_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   // Is owned by and has the same lifetime as `client`.
   PJRT_TopologyDescription* topology;  // out
@@ -435,7 +527,7 @@ typedef PJRT_Error* PJRT_Client_TopologyDescription(
 
 struct PJRT_Client_Devices_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   PJRT_Device* const* devices;  // out
   size_t num_devices;           // out
@@ -448,7 +540,7 @@ typedef PJRT_Error* PJRT_Client_Devices(PJRT_Client_Devices_Args* args);
 
 struct PJRT_Client_AddressableDevices_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   PJRT_Device* const* addressable_devices;  // out
   size_t num_addressable_devices;           // out
@@ -464,7 +556,7 @@ typedef PJRT_Error* PJRT_Client_AddressableDevices(
 
 struct PJRT_Client_LookupDevice_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   int id;
   // `device` has the same lifetime as `client`. It is owned by `client`.
@@ -479,7 +571,7 @@ typedef PJRT_Error* PJRT_Client_LookupDevice(
 
 struct PJRT_Client_LookupAddressableDevice_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   int local_hardware_id;
   // `addressable_device` has the same lifetime as `client`. It is owned by
@@ -494,9 +586,49 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_LookupAddressableDevice_Args,
 typedef PJRT_Error* PJRT_Client_LookupAddressableDevice(
     PJRT_Client_LookupAddressableDevice_Args* args);
 
+typedef enum {
+  PJRT_ProcessState_kUnspecified = 0,
+  PJRT_ProcessState_kUninitialized = 1,
+  PJRT_ProcessState_kDisconnected = 2,
+  PJRT_ProcessState_kConnected = 3,
+  PJRT_ProcessState_kError = 4,
+} PJRT_ProcessState;
+
+// TODO: mwhittaker - Add the remaining fields from
+// tensorflow::CoordinatedTaskStateInfo.
+struct PJRT_ProcessInfo {
+  size_t struct_size;
+  int task_id;
+  uint64_t incarnation_id;
+  PJRT_ProcessState state;
+  int error_code;
+  const char* error_message;
+  size_t error_message_size;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_ProcessInfo, error_message_size);
+
+struct PJRT_Client_UpdateGlobalProcessInfo_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Client* client;
+  PJRT_ProcessInfo* process_infos;
+  size_t num_process_infos;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_UpdateGlobalProcessInfo_Args,
+                          num_process_infos);
+
+// Updates the PjRt client with information about all global processes.
+//
+// Recall that a distributed program may consist of multiple PjRt clients
+// spanning multiple machines. These clients perform collective operations, like
+// AllGather, to execute a distributed program. UpdateGlobalProcessInfo updates
+// a PjRt client with information about all processes.
+typedef PJRT_Error* PJRT_Client_UpdateGlobalProcessInfo(
+    PJRT_Client_UpdateGlobalProcessInfo_Args* args);
+
 struct PJRT_Client_AddressableMemories_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   PJRT_Memory* const* addressable_memories;  // out
   size_t num_addressable_memories;           // out
@@ -512,7 +644,7 @@ typedef PJRT_Error* PJRT_Client_AddressableMemories(
 
 struct PJRT_Program {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   // Serialized code in the specified format below.
   // String is owned by the caller.
   char* code;  // in/out depending on usage
@@ -529,14 +661,13 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Program, format_size);
 
 struct PJRT_Client_Compile_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   // Only needs to stay alive for the duration of the Compile call.
   // `program->format` and `program->format_size` are owned by the caller.
   const PJRT_Program* program;
   // TODO(b/240560013): consider putting some of option fields in priv.
-  // Serialized CompileOptionsProto
-  // (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/pjrt/compile_options.proto)
+  // Serialized CompileOptionsProto.
   const char* compile_options;
   size_t compile_options_size;
   PJRT_LoadedExecutable* executable;  // out
@@ -549,7 +680,7 @@ typedef PJRT_Error* PJRT_Client_Compile(PJRT_Client_Compile_Args* args);
 
 struct PJRT_Client_DefaultDeviceAssignment_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   int num_replicas;
   int num_partitions;
@@ -566,6 +697,129 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_DefaultDeviceAssignment_Args,
 typedef PJRT_Error* PJRT_Client_DefaultDeviceAssignment(
     PJRT_Client_DefaultDeviceAssignment_Args* args);
 
+struct PJRT_Client_DmaMap_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Client* client;
+  void* data;
+  size_t size;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_DmaMap_Args, size);
+
+typedef PJRT_Error* PJRT_Client_DmaMap(PJRT_Client_DmaMap_Args* args);
+
+struct PJRT_Client_DmaUnmap_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Client* client;
+  void* data;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_DmaUnmap_Args, data);
+
+typedef PJRT_Error* PJRT_Client_DmaUnmap(PJRT_Client_DmaUnmap_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_Destroy_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_AsyncHostToDeviceTransferManager_Destroy_Args,
+                          transfer_manager);
+
+// Frees `transfer_manager`. `transfer_manager` can be nullptr.
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_Destroy(
+    PJRT_AsyncHostToDeviceTransferManager_Destroy_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_TransferData_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+  int buffer_index;
+  const void* data;
+  int64_t offset;
+  int64_t transfer_size;
+  bool is_last_transfer;
+  PJRT_Event* done_with_h2d_transfer;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(
+    PJRT_AsyncHostToDeviceTransferManager_TransferData_Args,
+    done_with_h2d_transfer);
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_TransferData(
+    PJRT_AsyncHostToDeviceTransferManager_TransferData_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+  int buffer_index;
+  PJRT_Buffer* buffer_out;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(
+    PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer_Args, buffer_out);
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer(
+    PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_Device_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+  PJRT_Device* device_out;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_AsyncHostToDeviceTransferManager_Device_Args,
+                          device_out);
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_Device(
+    PJRT_AsyncHostToDeviceTransferManager_Device_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_BufferCount_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+  size_t buffer_count;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(
+    PJRT_AsyncHostToDeviceTransferManager_BufferCount_Args, buffer_count);
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_BufferCount(
+    PJRT_AsyncHostToDeviceTransferManager_BufferCount_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_BufferSize_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+  int buffer_index;
+  size_t buffer_size;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_AsyncHostToDeviceTransferManager_BufferSize_Args,
+                          buffer_size);
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_BufferSize(
+    PJRT_AsyncHostToDeviceTransferManager_BufferSize_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_SetBufferError_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+  int buffer_index;
+  PJRT_Error_Code error_code;
+  const char* error_message;
+  size_t error_message_size;
+};
+PJRT_DEFINE_STRUCT_TRAITS(
+    PJRT_AsyncHostToDeviceTransferManager_SetBufferError_Args,
+    error_message_size);
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_SetBufferError(
+    PJRT_AsyncHostToDeviceTransferManager_SetBufferError_Args* args);
+
+struct PJRT_AsyncHostToDeviceTransferManager_AddMetadata_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;
+  const PJRT_NamedValue* transfer_metadata;
+  size_t num_metadata;
+};
+PJRT_DEFINE_STRUCT_TRAITS(
+    PJRT_AsyncHostToDeviceTransferManager_AddMetadata_Args, num_metadata);
+typedef PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_AddMetadata(
+    PJRT_AsyncHostToDeviceTransferManager_AddMetadata_Args* args);
+
 typedef enum {
   // Invalid primitive type to serve as default.
   PJRT_Buffer_Type_INVALID,
@@ -612,6 +866,20 @@ typedef enum {
   // 4-bit integer types
   PJRT_Buffer_Type_S4,
   PJRT_Buffer_Type_U4,
+
+  PJRT_Buffer_Type_TOKEN,
+
+  // 2-bit integer types
+  PJRT_Buffer_Type_S2,
+  PJRT_Buffer_Type_U2,
+
+  // More truncated 8 bit floating-point formats.
+  PJRT_Buffer_Type_F8E4M3,
+  PJRT_Buffer_Type_F8E3M4,
+  PJRT_Buffer_Type_F8E8M0FNU,
+
+  // 4-bit MX floating-point format.
+  PJRT_Buffer_Type_F4E2M1FN,
 } PJRT_Buffer_Type;
 
 typedef enum {
@@ -629,11 +897,24 @@ typedef enum {
   PJRT_HostBufferSemantics_kImmutableUntilTransferCompletes,
 
   // The PjRtBuffer may alias `data` internally and the runtime may use the
-  // `data` contents as long as the buffer is alive. The caller promises to
-  // keep `data` alive and not to mutate its contents as long as the buffer is
-  // alive; to notify the caller that the buffer may be freed, the runtime
-  // will call `done_with_host_buffer` when the PjRtBuffer is freed.
-  PJRT_HostBufferSemantics_kZeroCopy,
+  // `data` contents as long as the buffer is alive. The runtime promises not
+  // to mutate contents of the buffer (i.e. it will not use it for aliased
+  // output buffers). The caller promises to keep `data` alive and not to mutate
+  // its contents as long as the buffer is alive; to notify the caller that the
+  // buffer may be freed, the runtime will call `done_with_host_buffer` when the
+  // PjRtBuffer is freed.
+  PJRT_HostBufferSemantics_kImmutableZeroCopy,
+
+  // The PjRtBuffer may alias `data` internally and the runtime may use the
+  // `data` contents as long as the buffer is alive. The runtime is allowed
+  // to mutate contents of the buffer (i.e. use it for aliased output
+  // buffers). The caller promises to keep `data` alive and not to mutate its
+  // contents as long as the buffer is alive (otherwise it could be a data
+  // race with the runtime); to notify the caller that the buffer may be
+  // freed, the runtime will call `done_with_host_buffer` when the
+  // PjRtBuffer is freed. On non-CPU platforms this acts identically to
+  // kImmutableUntilTransferCompletes.
+  PJRT_HostBufferSemantics_kMutableZeroCopy,
 } PJRT_HostBufferSemantics;
 
 typedef enum {
@@ -643,7 +924,7 @@ typedef enum {
 
 struct PJRT_Buffer_MemoryLayout_Tiled {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   // A map from physical dimension numbers to logical dimension numbers.
   // The first element is the most minor physical dimension (fastest varying
   // index) and the last the most major (slowest varying index). The contents of
@@ -661,7 +942,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_MemoryLayout_Tiled, num_tiles);
 
 struct PJRT_Buffer_MemoryLayout_Strides {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   // Number of bytes to traverse per dimension. Must be the same size as
   // the number of dimensions of the data. Caution: `byte_strides` are allowed
   // to be negative, in which case data may need to point to the interior of
@@ -676,7 +957,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_MemoryLayout_Strides, num_byte_strides);
 // strides.
 struct PJRT_Buffer_MemoryLayout {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   union {
     PJRT_Buffer_MemoryLayout_Tiled tiled;
     PJRT_Buffer_MemoryLayout_Strides strides;
@@ -685,9 +966,76 @@ struct PJRT_Buffer_MemoryLayout {
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_MemoryLayout, type);
 
+struct PJRT_Client_CreateUninitializedBuffer_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Client* client;
+
+  // Shape fields.
+  const int64_t* shape_dims;
+  size_t shape_num_dims;
+  PJRT_Buffer_Type shape_element_type;
+  PJRT_Buffer_MemoryLayout* shape_layout;
+
+  // Device on which the uninitialized buffer is created.
+  PJRT_Device* device;
+
+  // If nullptr, the buffer is created on `device`; otherwise it is created in
+  // `memory`.
+  PJRT_Memory* memory;
+
+  // Output device buffer. The caller is responsible for calling
+  // PJRT_Buffer_Destroy.
+  PJRT_Buffer* buffer;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_CreateUninitializedBuffer_Args, buffer);
+
+typedef PJRT_Error* PJRT_Client_CreateUninitializedBuffer(
+    PJRT_Client_CreateUninitializedBuffer_Args* args);
+
+struct PJRT_Client_CreateAliasBuffer_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Client* client;
+
+  // Destination memory space for the buffer alias.
+  PJRT_Memory* memory;
+
+  // Shape fields.
+  const int64_t* shape_dims;
+  size_t shape_num_dims;
+  PJRT_Buffer_Type shape_element_type;
+  PJRT_Buffer_MemoryLayout* shape_layout;
+
+  PJRT_Buffer* alias_buffer;                                 // out
+  PJRT_FulfillAliasBufferCallback* fulfill_alias_buffer_cb;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_CreateAliasBuffer_Args,
+                          fulfill_alias_buffer_cb);
+
+typedef PJRT_Error* PJRT_Client_CreateAliasBuffer(
+    PJRT_Client_CreateAliasBuffer_Args* args);
+
+struct PJRT_Client_FulfillAliasBuffer_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Client* client;
+
+  PJRT_Buffer* buffer;                                       // in
+  PJRT_Error_Code status_code;                               // in
+  const char* error_message;                                 // in
+  size_t error_message_size;                                 // in
+  PJRT_FulfillAliasBufferCallback* fulfill_alias_buffer_cb;  // in
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_FulfillAliasBuffer_Args,
+                          fulfill_alias_buffer_cb);
+
+typedef PJRT_Error* PJRT_Client_FulfillAliasBuffer(
+    PJRT_Client_FulfillAliasBuffer_Args* args);
+
 struct PJRT_Client_BufferFromHostBuffer_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   // Pointer to the host buffer
   const void* data;
@@ -735,7 +1083,7 @@ typedef PJRT_Error* PJRT_Client_BufferFromHostBuffer(
 
 struct PJRT_Client_CreateViewOfDeviceBuffer_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   // A pointer to a non-owned device buffer. A PJRT_Buffer that is a non-owned
   // view of this device buffer will be created.
@@ -744,7 +1092,9 @@ struct PJRT_Client_CreateViewOfDeviceBuffer_Args {
   size_t num_dims;
   PJRT_Buffer_Type element_type;
   PJRT_Buffer_MemoryLayout* layout;
-  // The device that `device_buffer_ptr` is on.
+  // The device that `device_buffer_ptr` is on. The argument is ignored if
+  // `memory` is provided.
+  // DEPRECATED: Use `memory` instead.
   PJRT_Device* device;
   // A callback to be performed when the PJRT_Buffer is done with the on-device
   // buffer. This callback is optional and can be a nullptr.
@@ -760,8 +1110,10 @@ struct PJRT_Client_CreateViewOfDeviceBuffer_Args {
   // to be supported on all hardware platforms.
   intptr_t stream;
   PJRT_Buffer* buffer;  // out
+  // The memory space that `device_buffer_ptr` is in.
+  PJRT_Memory* memory;
 };
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_CreateViewOfDeviceBuffer_Args, buffer);
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_CreateViewOfDeviceBuffer_Args, memory);
 
 // Creates a PJRT buffer that is a non-owned view of an on-device buffer
 // (typically allocated by another library). The buffer may be mutated,
@@ -770,6 +1122,31 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_CreateViewOfDeviceBuffer_Args, buffer);
 typedef PJRT_Error* PJRT_Client_CreateViewOfDeviceBuffer(
     PJRT_Client_CreateViewOfDeviceBuffer_Args* args);
 
+struct PJRT_ShapeSpec {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  const int64_t* dims;
+  size_t num_dims;
+  PJRT_Buffer_Type element_type;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_ShapeSpec, element_type);
+
+struct PJRT_Client_CreateBuffersForAsyncHostToDevice_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Client* client;
+  PJRT_ShapeSpec* shape_specs;
+  size_t num_shape_specs;
+  PJRT_Buffer_MemoryLayout** device_layouts;  // optional
+  size_t num_device_layouts;
+  PJRT_Memory* memory;
+  PJRT_AsyncHostToDeviceTransferManager* transfer_manager;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Client_CreateBuffersForAsyncHostToDevice_Args,
+                          transfer_manager);
+typedef PJRT_Error* PJRT_Client_CreateBuffersForAsyncHostToDevice(
+    PJRT_Client_CreateBuffersForAsyncHostToDevice_Args* args);
+
 // -------------------------- Device Descriptions ------------------------------
 
 // Device descriptions may be associated with an actual device
@@ -781,7 +1158,7 @@ typedef PJRT_Error* PJRT_Client_CreateViewOfDeviceBuffer(
 
 struct PJRT_DeviceDescription_Id_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_DeviceDescription* device_description;
   int id;  // out
 };
@@ -795,7 +1172,7 @@ typedef PJRT_Error* PJRT_DeviceDescription_Id(
 
 struct PJRT_DeviceDescription_ProcessIndex_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_DeviceDescription* device_description;
   int process_index;  // out
 };
@@ -812,7 +1189,7 @@ typedef PJRT_Error* PJRT_DeviceDescription_ProcessIndex(
 
 struct PJRT_DeviceDescription_Attributes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_DeviceDescription* device_description;
   size_t num_attributes;              // out
   const PJRT_NamedValue* attributes;  // out
@@ -826,7 +1203,7 @@ typedef PJRT_Error* PJRT_DeviceDescription_Attributes(
 
 struct PJRT_DeviceDescription_Kind_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_DeviceDescription* device_description;
   // `device_kind` string is owned by `device` and has same lifetime as
   // `device`.
@@ -842,7 +1219,7 @@ typedef PJRT_Error* PJRT_DeviceDescription_Kind(
 
 struct PJRT_DeviceDescription_DebugString_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_DeviceDescription* device_description;
   const char* debug_string;  // out
   size_t debug_string_size;  // out
@@ -857,7 +1234,7 @@ typedef PJRT_Error* PJRT_DeviceDescription_DebugString(
 
 struct PJRT_DeviceDescription_ToString_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_DeviceDescription* device_description;
   const char* to_string;  // out
   size_t to_string_size;  // out
@@ -873,7 +1250,7 @@ typedef PJRT_Error* PJRT_DeviceDescription_ToString(
 
 struct PJRT_Device_GetDescription_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Device* device;
   PJRT_DeviceDescription* device_description;  // out
 };
@@ -885,7 +1262,7 @@ typedef PJRT_Error* PJRT_Device_GetDescription(
 
 struct PJRT_Device_IsAddressable_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Device* device;
   bool is_addressable;  // out
 };
@@ -897,7 +1274,7 @@ typedef PJRT_Error* PJRT_Device_IsAddressable(
 
 struct PJRT_Device_LocalHardwareId_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Device* device;
   int local_hardware_id;  // out
 };
@@ -910,13 +1287,13 @@ typedef PJRT_Error* PJRT_Device_LocalHardwareId(
 
 struct PJRT_Device_AddressableMemories_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Device* device;
   // Has the lifetime of `device`.
   PJRT_Memory* const* memories;  // out
   size_t num_memories;           // out
 };
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_Device_AddressableMemories_Args, memories);
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Device_AddressableMemories_Args, num_memories);
 
 // Returns the memories that a device can address.
 typedef PJRT_Error* PJRT_Device_AddressableMemories(
@@ -924,7 +1301,7 @@ typedef PJRT_Error* PJRT_Device_AddressableMemories(
 
 struct PJRT_Device_DefaultMemory_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Device* device;
   // `memory` has the same lifetime as `device`.
   PJRT_Memory* memory;  // out
@@ -938,7 +1315,7 @@ typedef PJRT_Error* PJRT_Device_DefaultMemory(
 
 struct PJRT_Device_MemoryStats_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Device* device;
 
   // Number of bytes in use.
@@ -989,7 +1366,7 @@ typedef PJRT_Error* PJRT_Device_MemoryStats(PJRT_Device_MemoryStats_Args* args);
 
 struct PJRT_Memory_Id_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Memory* memory;
   int id;  // out
 };
@@ -1000,20 +1377,31 @@ typedef PJRT_Error* PJRT_Memory_Id(PJRT_Memory_Id_Args* args);
 
 struct PJRT_Memory_Kind_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Memory* memory;
   // `memory_kind` has same lifetime as `memory`.
-  const char* memory_kind;  // out
-  size_t memory_kind_size;  // out
+  const char* kind;  // out
+  size_t kind_size;  // out
 };
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_Memory_Kind_Args, memory_kind_size);
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Memory_Kind_Args, kind_size);
 
 // A platform-dependent string that uniquely identifies the kind of the memory.
 typedef PJRT_Error* PJRT_Memory_Kind(PJRT_Memory_Kind_Args* args);
 
+struct PJRT_Memory_Kind_Id_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Memory* memory;
+  int kind_id;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Memory_Kind_Id_Args, kind_id);
+
+// A platform-dependent ID that uniquely identifies the kind of the memory.
+typedef PJRT_Error* PJRT_Memory_Kind_Id(PJRT_Memory_Kind_Id_Args* args);
+
 struct PJRT_Memory_DebugString_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Memory* memory;
   const char* debug_string;  // out
   size_t debug_string_size;  // out
@@ -1026,7 +1414,7 @@ typedef PJRT_Error* PJRT_Memory_DebugString(PJRT_Memory_DebugString_Args* args);
 
 struct PJRT_Memory_ToString_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Memory* memory;
   const char* to_string;  // out
   size_t to_string_size;  // out
@@ -1038,7 +1426,7 @@ typedef PJRT_Error* PJRT_Memory_ToString(PJRT_Memory_ToString_Args* args);
 
 struct PJRT_Memory_AddressableByDevices_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Memory* memory;
   PJRT_Device* const* devices;  // out
   size_t num_devices;           // out
@@ -1049,11 +1437,41 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Memory_AddressableByDevices_Args, num_devices);
 typedef PJRT_Error* PJRT_Memory_AddressableByDevices(
     PJRT_Memory_AddressableByDevices_Args* args);
 
+// ------------------------------- Execute Context -----------------------------
+
+// An opaque context passed to an execution that may be used to supply
+// additional arguments to a derived class of PJRT_Executable. It is the
+// caller's responsibility to ensure that the context is valid for the duration
+// of the execution.
+typedef struct PJRT_ExecuteContext PJRT_ExecuteContext;
+
+struct PJRT_ExecuteContext_Create_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_ExecuteContext* context;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_ExecuteContext_Create_Args, context);
+
+// Creates an execute context.
+typedef PJRT_Error* PJRT_ExecuteContext_Create(
+    PJRT_ExecuteContext_Create_Args* args);
+
+struct PJRT_ExecuteContext_Destroy_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_ExecuteContext* context;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_ExecuteContext_Destroy_Args, context);
+
+// Frees an execute context. `context` can be nullptr.
+typedef PJRT_Error* PJRT_ExecuteContext_Destroy(
+    PJRT_ExecuteContext_Destroy_Args* args);
+
 // ------------------------------- Executables ---------------------------------
 
 struct PJRT_Executable_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Executable_Destroy_Args, executable);
@@ -1063,7 +1481,7 @@ typedef PJRT_Error* PJRT_Executable_Destroy(PJRT_Executable_Destroy_Args* args);
 
 struct PJRT_LoadedExecutable_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_LoadedExecutable* executable;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_LoadedExecutable_Destroy_Args, executable);
@@ -1075,7 +1493,7 @@ typedef PJRT_Error* PJRT_LoadedExecutable_Destroy(
 
 struct PJRT_LoadedExecutable_GetExecutable_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_LoadedExecutable* loaded_executable;
   PJRT_Executable* executable;  // out
 };
@@ -1086,9 +1504,38 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_LoadedExecutable_GetExecutable_Args, executable);
 typedef PJRT_Error* PJRT_LoadedExecutable_GetExecutable(
     PJRT_LoadedExecutable_GetExecutable_Args* args);
 
+typedef struct PJRT_DeviceAssignmentSerialized PJRT_DeviceAssignmentSerialized;
+
+struct PJRT_LoadedExecutable_GetDeviceAssignment_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_LoadedExecutable* executable;
+
+  // Lives only as long as serialized_device_assignment
+  const char* serialized_bytes;  // out
+  size_t serialized_bytes_size;  // out
+
+  PJRT_DeviceAssignmentSerialized*
+      serialized_device_assignment;  // backs serialized_bytes.
+  // cleanup fn must be called to free the backing memory for serialized_bytes.
+  // Should only be called once on serialized_device_assignment.
+  void (*serialized_device_assignment_deleter)(
+      PJRT_DeviceAssignmentSerialized* da);  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_LoadedExecutable_GetDeviceAssignment_Args,
+                          serialized_device_assignment_deleter);
+
+// Retrieves the serialized DeviceAssignmentProto for a given
+// PJRT_LoadedExecutable. The implementation allocates the serialized data,
+// which is valid as long as `serialized_device_assignment` is alive. The
+// caller must call `serialized_device_assignment_deleter` to free the
+// backing memory.
+typedef PJRT_Error* PJRT_LoadedExecutable_GetDeviceAssignment(
+    PJRT_LoadedExecutable_GetDeviceAssignment_Args* args);
+
 struct PJRT_Executable_Name_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   // `executable_name` has the same lifetime as `executable`. It is owned by
   // `executable`.
@@ -1103,7 +1550,7 @@ typedef PJRT_Error* PJRT_Executable_Name(PJRT_Executable_Name_Args* args);
 // TODO(b/269178731): Revisit whether num_replicas is needed.
 struct PJRT_Executable_NumReplicas_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   size_t num_replicas;  // out
 };
@@ -1115,7 +1562,7 @@ typedef PJRT_Error* PJRT_Executable_NumReplicas(
 
 struct PJRT_Executable_NumPartitions_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   size_t num_partitions;  // out
 };
@@ -1127,7 +1574,7 @@ typedef PJRT_Error* PJRT_Executable_NumPartitions(
 
 struct PJRT_LoadedExecutable_AddressableDevices_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_LoadedExecutable* executable;
   PJRT_Device* const* addressable_devices;  // out
   size_t num_addressable_devices;           // out
@@ -1141,7 +1588,7 @@ typedef PJRT_Error* PJRT_LoadedExecutable_AddressableDevices(
 
 struct PJRT_Executable_OptimizedProgram_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   PJRT_Program* program;  // out, but read below
 };
@@ -1175,7 +1622,7 @@ typedef PJRT_Error* PJRT_Executable_OptimizedProgram(
 
 struct PJRT_LoadedExecutable_Delete_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_LoadedExecutable* executable;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_LoadedExecutable_Delete_Args, executable);
@@ -1190,7 +1637,7 @@ typedef PJRT_Error* PJRT_LoadedExecutable_Delete(
 
 struct PJRT_LoadedExecutable_IsDeleted_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_LoadedExecutable* executable;
   bool is_deleted;  // out
 };
@@ -1247,7 +1694,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_RecvCallbackInfo, recv_callback);
 
 struct PJRT_ExecuteOptions {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   // Callbacks for when send/recv ops are executed. The outer lists correspond
   // to each device returned by `PJRT_Executable_AddressableDevices` for
   // `executable` (i.e. they will have length `num_devices`). Each inner list
@@ -1274,12 +1721,31 @@ struct PJRT_ExecuteOptions {
   // during the call.
   const int64_t* non_donatable_input_indices;
   size_t num_non_donatable_input_indices;
-};
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_ExecuteOptions, launch_id);
+  PJRT_ExecuteContext* context;
+  // The `call_location` field is used to pass down call site location
+  // information from higher-level frameworks like JAX and PyTorch to the PJRT
+  // plugin. This field stores the source location (e.g., file:line) of the
+  // Python code that triggered the execution of this compiled program. This
+  // differs from the source location metadata stored in `OpMetadata`, which
+  // refers to the origin of individual operations within the HLO module.
+  // The plugin can use `call_location` for debugging and error reporting,
+  // allowing users to pinpoint which program execution led to an issue.
+  // The `call_location` pointer is owned by the caller and must point to a
+  // null-terminated string. It is only valid for the duration of the C API
+  // call. The plugin must copy the string if it needs to be stored.
+  const char* call_location;
+
+  // The incarnation id for every task. For every 0 <= i < num_tasks,
+  // task task_ids[i] has incarnation incarnation_ids[i].
+  size_t num_tasks;
+  int* task_ids;
+  int64_t* incarnation_ids;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_ExecuteOptions, incarnation_ids);
 
 struct PJRT_LoadedExecutable_Execute_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_LoadedExecutable* executable;
   // Only needs to stay alive for the duration of the Execute call.
   PJRT_ExecuteOptions* options;
@@ -1318,7 +1784,7 @@ typedef PJRT_Error* PJRT_LoadedExecutable_Execute(
 
 struct PJRT_Executable_NumOutputs_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   size_t num_outputs;  // out
 };
@@ -1330,7 +1796,7 @@ typedef PJRT_Error* PJRT_Executable_NumOutputs(
 
 struct PJRT_Executable_SizeOfGeneratedCodeInBytes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   int64_t size_in_bytes;  // out
 };
@@ -1342,7 +1808,7 @@ typedef PJRT_Error* PJRT_Executable_SizeOfGeneratedCodeInBytes(
 
 struct PJRT_Executable_Fingerprint_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   // Has the lifetime of `executable`
   const char* executable_fingerprint;  // out
@@ -1360,7 +1826,7 @@ typedef PJRT_Error* PJRT_Executable_Fingerprint(
 
 struct PJRT_Executable_GetCostAnalysis_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   size_t num_properties;  // out
   // `properties` and any embedded data are owned by and have the same lifetime
@@ -1378,28 +1844,40 @@ typedef PJRT_Error* PJRT_Executable_GetCostAnalysis(
 
 struct PJRT_Executable_GetCompiledMemoryStats_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
 
   // Mirrors xla::CompiledMemoryStats.
+  // Device default memory (e.g., HBM for GPU/TPU) usage stats.
   int64_t generated_code_size_in_bytes;  // out
   int64_t argument_size_in_bytes;        // out
   int64_t output_size_in_bytes;          // out
   // How much argument is reused for output.
   int64_t alias_size_in_bytes;  // out
   int64_t temp_size_in_bytes;   // out
+
+  // Host memory usage stats.
+  int64_t host_generated_code_size_in_bytes;  // out
+  int64_t host_argument_size_in_bytes;        // out
+  int64_t host_output_size_in_bytes;          // out
+  int64_t host_alias_size_in_bytes;           // out
+  int64_t host_temp_size_in_bytes;            // out
+
+  // Device memory stats, from xla::CompiledMemoryStats.
+  int64_t peak_memory_in_bytes;  // out
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Executable_GetCompiledMemoryStats_Args,
-                          temp_size_in_bytes);
+                          peak_memory_in_bytes);
 
-// Return memory stats that allow callers to estimate device memory usage
-// when running this executable.
+// Return memory stats that allow callers to estimate memory usage when running
+// this executable. The memory stats could contain usage info from different
+// memory spaces, like default memory (e.g., HBM for GPU/TPU) and host memory.
 typedef PJRT_Error* PJRT_Executable_GetCompiledMemoryStats(
     PJRT_Executable_GetCompiledMemoryStats_Args* args);
 
 struct PJRT_Executable_OutputElementTypes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   PJRT_Buffer_Type* output_types;  // out
   size_t num_output_types;         // out
@@ -1413,7 +1891,7 @@ typedef PJRT_Error* PJRT_Executable_OutputElementTypes(
 
 struct PJRT_Executable_OutputDimensions_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   size_t num_outputs;
   // Has length: sum of all elements in the list `dim_sizes`.
@@ -1431,7 +1909,7 @@ typedef PJRT_Error* PJRT_Executable_OutputDimensions(
 
 struct PJRT_Executable_OutputMemoryKinds_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Executable* executable;
   size_t num_outputs;
   // Has length `num_outputs`.
@@ -1450,7 +1928,7 @@ typedef struct PJRT_SerializedExecutable PJRT_SerializedExecutable;
 
 struct PJRT_Executable_Serialize_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const PJRT_Executable* executable;
 
   // Lives only as long as serialized_executable
@@ -1473,14 +1951,18 @@ typedef PJRT_Error* PJRT_Executable_Serialize(
 
 struct PJRT_Executable_DeserializeAndLoad_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Client* client;
   const char* serialized_executable;
   size_t serialized_executable_size;
   PJRT_LoadedExecutable* loaded_executable;  // out
+  // Serialized CompileOptionsProto or null (to use the options
+  // from the serialized executable).
+  const char* overridden_serialized_compile_options;
+  size_t overridden_serialized_compile_options_size;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Executable_DeserializeAndLoad_Args,
-                          loaded_executable);
+                          overridden_serialized_compile_options_size);
 
 // Deserializes an executable serialized by `PJRT_Executable_Serialize`.
 // `serialized_executable` must have been produced by the same platform and
@@ -1490,7 +1972,7 @@ typedef PJRT_Error* PJRT_Executable_DeserializeAndLoad(
 
 struct PJRT_LoadedExecutable_Fingerprint_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_LoadedExecutable* executable;
   // Has the lifetime of `executable`
   const char* executable_fingerprint;  // out
@@ -1510,7 +1992,7 @@ typedef PJRT_Error* PJRT_LoadedExecutable_Fingerprint(
 
 struct PJRT_Buffer_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_Destroy_Args, buffer);
@@ -1521,7 +2003,7 @@ typedef PJRT_Error* PJRT_Buffer_Destroy(PJRT_Buffer_Destroy_Args* args);
 
 struct PJRT_Buffer_ElementType_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   PJRT_Buffer_Type type;  // out
 };
@@ -1532,7 +2014,7 @@ typedef PJRT_Error* PJRT_Buffer_ElementType(PJRT_Buffer_ElementType_Args* args);
 
 struct PJRT_Buffer_Dimensions_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   // Has the lifetime of `buffer` and length `num_dims`.
   const int64_t* dims;  // out
@@ -1545,7 +2027,7 @@ typedef PJRT_Error* PJRT_Buffer_Dimensions(PJRT_Buffer_Dimensions_Args* args);
 
 struct PJRT_Buffer_UnpaddedDimensions_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   // Has the lifetime of `buffer` and length `num_dims`.
   const int64_t* unpadded_dims;  // out
@@ -1565,7 +2047,7 @@ typedef PJRT_Error* PJRT_Buffer_UnpaddedDimensions(
 
 struct PJRT_Buffer_DynamicDimensionIndices_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   // Has the lifetime of `buffer` and length `num_dynamic_dims`.
   const size_t* dynamic_dim_indices;  // out
@@ -1583,20 +2065,22 @@ typedef PJRT_Error* PJRT_Buffer_DynamicDimensionIndices(
 
 struct PJRT_Buffer_GetMemoryLayout_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   // Layout data is owned by and has the lifetime of `buffer`.
   PJRT_Buffer_MemoryLayout layout;  // out
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_GetMemoryLayout_Args, layout);
 
+// DEPRECATED. Please use layout extension instead.
+// https://github.com/openxla/xla/blob/main/xla/pjrt/c/pjrt_c_api_layouts_extension.h
 // Returns the memory layout of the data in this buffer.
 typedef PJRT_Error* PJRT_Buffer_GetMemoryLayout(
     PJRT_Buffer_GetMemoryLayout_Args* args);
 
 struct PJRT_Buffer_ToHostBuffer_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* src;
 
   // The caller can specify an optional host layout. If nullptr, the layout of
@@ -1622,7 +2106,7 @@ typedef PJRT_Error* PJRT_Buffer_ToHostBuffer(
 
 struct PJRT_Buffer_OnDeviceSizeInBytes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   size_t on_device_size_in_bytes;  // out
 };
@@ -1635,7 +2119,7 @@ typedef PJRT_Error* PJRT_Buffer_OnDeviceSizeInBytes(
 
 struct PJRT_Buffer_Delete_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_Delete_Args, buffer);
@@ -1649,7 +2133,7 @@ typedef PJRT_Error* PJRT_Buffer_Delete(PJRT_Buffer_Delete_Args* args);
 
 struct PJRT_Buffer_IsDeleted_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   bool is_deleted;  // out
 };
@@ -1658,9 +2142,23 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_IsDeleted_Args, is_deleted);
 // True if and only if PJRT_Buffer_Delete has previously been called.
 typedef PJRT_Error* PJRT_Buffer_IsDeleted(PJRT_Buffer_IsDeleted_Args* args);
 
+struct PJRT_Buffer_CopyRawToHost_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Buffer* buffer;
+  void* dst;
+  int64_t offset;
+  int64_t transfer_size;
+  PJRT_Event* event;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyRawToHost_Args, event);
+
+typedef PJRT_Error* PJRT_Buffer_CopyRawToHost(
+    PJRT_Buffer_CopyRawToHost_Args* args);
+
 struct PJRT_Buffer_CopyToDevice_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   PJRT_Device* dst_device;
   PJRT_Buffer* dst_buffer;  // out
@@ -1675,7 +2173,7 @@ typedef PJRT_Error* PJRT_Buffer_CopyToDevice(
 
 struct PJRT_Buffer_CopyToMemory_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   PJRT_Memory* dst_memory;
   PJRT_Buffer* dst_buffer;  // out
@@ -1690,7 +2188,7 @@ typedef PJRT_Error* PJRT_Buffer_CopyToMemory(
 
 struct PJRT_Buffer_IsOnCpu_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   bool is_on_cpu;  // out
 };
@@ -1701,7 +2199,7 @@ typedef PJRT_Error* PJRT_Buffer_IsOnCpu(PJRT_Buffer_IsOnCpu_Args* args);
 
 struct PJRT_Buffer_Device_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   PJRT_Device* device;  // out
 };
@@ -1712,7 +2210,7 @@ typedef PJRT_Error* PJRT_Buffer_Device(PJRT_Buffer_Device_Args* args);
 
 struct PJRT_Buffer_Memory_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   PJRT_Memory* memory;  // out
 };
@@ -1723,7 +2221,7 @@ typedef PJRT_Error* PJRT_Buffer_Memory(PJRT_Buffer_Memory_Args* args);
 
 struct PJRT_Buffer_ReadyEvent_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   // The caller is responsible for calling PJRT_Event_Destroy on `event`.
   PJRT_Event* event;  // out
@@ -1743,7 +2241,7 @@ typedef PJRT_Error* PJRT_Buffer_ReadyEvent(PJRT_Buffer_ReadyEvent_Args* args);
 
 struct PJRT_Buffer_UnsafePointer_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   uintptr_t buffer_pointer;  // out
 };
@@ -1756,7 +2254,7 @@ typedef PJRT_Error* PJRT_Buffer_UnsafePointer(
 
 struct PJRT_Buffer_IncreaseExternalReferenceCount_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_IncreaseExternalReferenceCount_Args,
@@ -1772,7 +2270,7 @@ typedef PJRT_Error* PJRT_Buffer_IncreaseExternalReferenceCount(
 
 struct PJRT_Buffer_DecreaseExternalReferenceCount_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_DecreaseExternalReferenceCount_Args,
@@ -1786,7 +2284,7 @@ typedef PJRT_Error* PJRT_Buffer_DecreaseExternalReferenceCount(
 
 struct PJRT_Buffer_OpaqueDeviceMemoryDataPointer_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_Buffer* buffer;
   void* device_memory_ptr;  // out
 };
@@ -1803,7 +2301,7 @@ typedef PJRT_Error* PJRT_Buffer_OpaqueDeviceMemoryDataPointer(
 
 struct PJRT_CopyToDeviceStream_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_CopyToDeviceStream* stream;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_CopyToDeviceStream_Destroy_Args, stream);
@@ -1814,7 +2312,7 @@ typedef PJRT_Error* PJRT_CopyToDeviceStream_Destroy(
 
 struct PJRT_CopyToDeviceStream_AddChunk_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_CopyToDeviceStream* stream;
   // Takes ownership of `chunk` (i.e. implementation will call chunk.deleter).
   PJRT_Chunk* chunk;
@@ -1835,7 +2333,7 @@ typedef PJRT_Error* PJRT_CopyToDeviceStream_AddChunk(
 
 struct PJRT_CopyToDeviceStream_TotalBytes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_CopyToDeviceStream* stream;
   int64_t total_bytes;  // out
 };
@@ -1847,7 +2345,7 @@ typedef PJRT_Error* PJRT_CopyToDeviceStream_TotalBytes(
 
 struct PJRT_CopyToDeviceStream_GranuleSize_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_CopyToDeviceStream* stream;
   int64_t granule_size_in_bytes;  // out
 };
@@ -1861,7 +2359,7 @@ typedef PJRT_Error* PJRT_CopyToDeviceStream_GranuleSize(
 
 struct PJRT_CopyToDeviceStream_CurrentBytes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_CopyToDeviceStream* stream;
   int64_t current_bytes;  // out
 };
@@ -1877,7 +2375,7 @@ typedef PJRT_Error* PJRT_CopyToDeviceStream_CurrentBytes(
 
 struct PJRT_TopologyDescription_Create_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const char* topology_name;
   size_t topology_name_size;
   // Extra platform-specific options to create a client.
@@ -1894,7 +2392,7 @@ typedef PJRT_Error* PJRT_TopologyDescription_Create(
 
 struct PJRT_TopologyDescription_Destroy_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_TopologyDescription* topology;
 };
 PJRT_DEFINE_STRUCT_TRAITS(PJRT_TopologyDescription_Destroy_Args, topology);
@@ -1905,7 +2403,7 @@ typedef PJRT_Error* PJRT_TopologyDescription_Destroy(
 
 struct PJRT_TopologyDescription_PlatformVersion_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_TopologyDescription* topology;
   // `platform_version` has the same lifetime as `topology`. It's owned by
   // `topology`.
@@ -1922,8 +2420,8 @@ typedef PJRT_Error* PJRT_TopologyDescription_PlatformVersion(
 
 struct PJRT_TopologyDescription_PlatformName_Args {
   size_t struct_size;
-  void* priv;
-  PJRT_TopologyDescription* topology;
+  PJRT_Extension_Base* extension_start;
+  const PJRT_TopologyDescription* topology;
   // `platform_name` has the same lifetime as `topology`. It is owned by
   // `topology`.
   const char* platform_name;  // out
@@ -1938,8 +2436,8 @@ typedef PJRT_Error* PJRT_TopologyDescription_PlatformName(
 
 struct PJRT_TopologyDescription_GetDeviceDescriptions_Args {
   size_t struct_size;
-  void* priv;
-  PJRT_TopologyDescription* topology;
+  PJRT_Extension_Base* extension_start;
+  const PJRT_TopologyDescription* topology;
   // Has the same lifetime as topology.
   PJRT_DeviceDescription* const* descriptions;  // out
   size_t num_descriptions;                      // out
@@ -1957,7 +2455,7 @@ typedef struct PJRT_SerializedTopology PJRT_SerializedTopology;
 
 struct PJRT_TopologyDescription_Serialize_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_TopologyDescription* topology;
 
   // Lives only as long as serialized_topology.
@@ -1977,9 +2475,22 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_TopologyDescription_Serialize_Args,
 typedef PJRT_Error* PJRT_TopologyDescription_Serialize(
     PJRT_TopologyDescription_Serialize_Args* args);
 
+struct PJRT_TopologyDescription_Deserialize_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  const char* serialized_topology;
+  size_t serialized_topology_size;
+
+  PJRT_TopologyDescription* topology;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_TopologyDescription_Deserialize_Args, topology);
+
+typedef PJRT_Error* PJRT_TopologyDescription_Deserialize(
+    PJRT_TopologyDescription_Deserialize_Args* args);
+
 struct PJRT_TopologyDescription_Attributes_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   PJRT_TopologyDescription* topology;
 
   // Only lives as long as topology.
@@ -1995,14 +2506,13 @@ typedef PJRT_Error* PJRT_TopologyDescription_Attributes(
 
 struct PJRT_Compile_Args {
   size_t struct_size;
-  void* priv;
+  PJRT_Extension_Base* extension_start;
   const PJRT_TopologyDescription* topology;
   // Only needs to stay alive for the duration of the Compile call.
   // `program->format` and `program->format_size` are owned by the caller.
   const PJRT_Program* program;
   // TODO(b/240560013): consider putting some of option fields in priv.
-  // Serialized CompileOptionsProto
-  // (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/pjrt/compile_options.proto)
+  // Serialized CompileOptionsProto.
   const char* compile_options;
   size_t compile_options_size;
   // Optionally provided for performance-guided optimizations.
@@ -2016,21 +2526,6 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Compile_Args, executable);
 // PJRT_Client before execution.
 typedef PJRT_Error* PJRT_Compile(PJRT_Compile_Args* args);
 
-// -------------------------------- Extension ----------------------------------
-
-typedef enum {
-  PJRT_Structure_Type_Gpu_Custom_Call = 0,
-  PJRT_Structure_Type_Profiler,
-} PJRT_Structure_Type;
-
-// PJRT_Structure_Base contains a type and a pointer to next
-// PJRT_Structure_Base. The framework can go through this chain to find
-// structure and identify it with the type.
-typedef struct PJRT_Structure_Base {
-  PJRT_Structure_Type type;
-  const struct PJRT_Structure_Base* next;
-} PJRT_Structure_Base;
-
 // -------------------------------- API access ---------------------------------
 
 #define _PJRT_API_STRUCT_FIELD(fn_type) fn_type* fn_type
@@ -2038,7 +2533,7 @@ typedef struct PJRT_Structure_Base {
 // Please modify PJRT_Api_STRUCT_SIZE if the last field of PJRT_Api is changed.
 typedef struct PJRT_Api {
   size_t struct_size;
-  void* extension_start;
+  PJRT_Extension_Base* extension_start;
 
   PJRT_Api_Version pjrt_api_version;
 
@@ -2159,11 +2654,35 @@ typedef struct PJRT_Api {
   _PJRT_API_STRUCT_FIELD(PJRT_Client_TopologyDescription);
 
   _PJRT_API_STRUCT_FIELD(PJRT_Executable_GetCompiledMemoryStats);
+
+  _PJRT_API_STRUCT_FIELD(PJRT_Memory_Kind_Id);
+
+  _PJRT_API_STRUCT_FIELD(PJRT_ExecuteContext_Create);
+  _PJRT_API_STRUCT_FIELD(PJRT_ExecuteContext_Destroy);
+  _PJRT_API_STRUCT_FIELD(PJRT_Buffer_CopyRawToHost);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_Destroy);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_TransferData);
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_CreateBuffersForAsyncHostToDevice);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_Device);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_BufferCount);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_BufferSize);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_SetBufferError);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_AddMetadata);
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_DmaMap);
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_DmaUnmap);
+
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_CreateUninitializedBuffer);
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_UpdateGlobalProcessInfo);
+  _PJRT_API_STRUCT_FIELD(PJRT_TopologyDescription_Deserialize);
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_CreateAliasBuffer);
+  _PJRT_API_STRUCT_FIELD(PJRT_Client_FulfillAliasBuffer);
+  _PJRT_API_STRUCT_FIELD(PJRT_LoadedExecutable_GetDeviceAssignment);
 } PJRT_Api;
 
 enum {
   PJRT_Api_STRUCT_SIZE =
-      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Client_TopologyDescription)
+      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_LoadedExecutable_GetDeviceAssignment)
 };
 
 #undef _PJRT_API_STRUCT_FIELD
@@ -2173,3 +2692,4 @@ enum {
 #endif
 
 #endif  // XLA_PJRT_C_PJRT_C_API_H_
+