diff --git a/.github/workflows/build-all.yml b/.github/workflows/build-all.yml index 62617826b6..8977c2d637 100644 --- a/.github/workflows/build-all.yml +++ b/.github/workflows/build-all.yml @@ -9,11 +9,18 @@ on: jobs: build-tarballs: - runs-on: ubuntu-latest - strategy: matrix: - target: [ x86_64, aarch64, arm, riscv64, ppc64le, s390x ] + include: + - { target: x86_64, os: ubuntu-24.04 } + - { target: aarch64, os: ubuntu-24.04-arm } + - { target: arm, os: ubuntu-24.04-arm } + - { target: riscv64, os: ubuntu-24.04-arm } + - { target: ppc64le, os: ubuntu-24.04-arm } + - { target: s390x, os: ubuntu-24.04-arm } + - { target: loongarch64, os: ubuntu-24.04-arm } + + runs-on: ${{ matrix.os }} permissions: contents: read @@ -21,21 +28,18 @@ jobs: steps: - name: Checkout Repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 + + - name: Install Podman + run: sudo apt-get update && sudo apt-get install -y podman qemu-user-static - name: Login to GitHub Container Registry - uses: docker/login-action@v1 + uses: redhat-actions/podman-login@v1 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set up QEMU - uses: docker/setup-qemu-action@v1 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Build a tarball run: ./dist.sh ${{ matrix.target }} @@ -43,4 +47,5 @@ jobs: uses: actions/upload-artifact@v4 with: name: ${{ matrix.target }} - path: mold-*.tar.gz + path: dist/mold-*.tar.gz + compression-level: 0 diff --git a/.github/workflows/build-native.yml b/.github/workflows/build-native.yml new file mode 100644 index 0000000000..bce8d15820 --- /dev/null +++ b/.github/workflows/build-native.yml @@ -0,0 +1,44 @@ +name: Build native tarballs + +on: + push: + workflow_dispatch: + +jobs: + build-tarballs: + strategy: + matrix: + include: + - { target: x86_64, os: ubuntu-24.04 } + - { target: aarch64, os: ubuntu-24.04-arm } + - { target: arm, os: ubuntu-24.04-arm } + + runs-on: ${{ 
matrix.os }} + + permissions: + contents: read + packages: write + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Install Podman + run: sudo apt-get update && sudo apt-get install -y podman qemu-user-static + + - name: Login to GitHub Container Registry + uses: redhat-actions/podman-login@v1 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build a tarball + run: ./dist.sh ${{ matrix.target }} + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.target }} + path: dist/mold-*.tar.gz + compression-level: 0 diff --git a/.github/workflows/build-x86.yml b/.github/workflows/build-x86.yml deleted file mode 100644 index 828a9f14af..0000000000 --- a/.github/workflows/build-x86.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Build x86 tarball - -on: - push: - branches: [ main ] - workflow_dispatch: - -jobs: - build-tarball: - runs-on: ubuntu-latest - - permissions: - contents: read - packages: write - - steps: - - name: Checkout Repository - uses: actions/checkout@v2 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v1 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build a tarball - run: ./dist.sh - - - name: Upload artifact - uses: actions/upload-artifact@v4 - with: - name: tarball - path: mold-*.tar.gz diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 16e27facc9..d99ae4ad93 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: - '-DMOLD_USE_TSAN=On' runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: rui314/setup-mold@staging - run: sudo ./install-build-deps.sh - name: build @@ -24,9 +24,9 @@ jobs: cd build cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=clang-18 
-DCMAKE_CXX_COMPILER=clang++-18 ${{ matrix.target }} .. cmake --build . -j$(nproc) - - run: ctest --test-dir build -j$(nproc) + - run: cd build; ctest --output-on-failure -j$(nproc) - name: archive test results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: test-results-clang @@ -38,55 +38,23 @@ jobs: runs-on: ubuntu-latest container: gcc:11.1.0 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: install-build-deps shell: bash run: | # Install cross toolchains ./install-build-deps.sh ./install-cross-tools.sh - - # Install a RV32 toolchain from third party since it's not available - # as an Ubuntu package. - mkdir /rv32 - wget -O- -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.07.07/riscv32-glibc-ubuntu-20.04-gcc-nightly-2023.07.07-nightly.tar.gz | tar -C /rv32 --strip-components=1 -xzf - - - ln -sf /rv32/sysroot /usr/riscv32-linux-gnu - echo '/rv32/bin/riscv32-unknown-linux-gnu-gcc -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-gcc - echo '/rv32/bin/riscv32-unknown-linux-gnu-g++ -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-g++ - chmod 755 /usr/bin/riscv32-linux-gnu-{gcc,g++} - - for i in objdump objcopy strip; do - ln -sf /rv32/bin/riscv32-unknown-linux-gnu-$i /usr/bin/riscv32-linux-gnu-$i - done - - # Install a LoongArch toolchain - mkdir /larch - wget -O- -q https://github.com/loongson/build-tools/releases/download/2024.08.08/x86_64-cross-tools-loongarch64-binutils_2.43-gcc_14.2.0-glibc_2.40.tar.xz | tar -C /larch --strip-components=1 --xz -xf - - - cp -r /larch/loongarch64-unknown-linux-gnu/lib/* /larch/target/lib64 - ln -sf /larch/target /usr/loongarch64-linux-gnu - - for i in gcc g++ objdump objcopy strip; do - ln -sf /larch/bin/loongarch64-unknown-linux-gnu-$i /usr/bin/loongarch64-linux-gnu-$i - done - - wget -O /usr/local/bin/qemu-loongarch64 -q https://github.com/loongson/build-tools/releases/download/2023.08.08/qemu-loongarch64 
- chmod 755 /usr/local/bin/qemu-loongarch64 - - # Install Intel SDE CPU emulator for CET-related tests - mkdir /sde - wget -O- -q https://downloadmirror.intel.com/813591/sde-external-9.33.0-2024-01-07-lin.tar.xz | tar -C /sde --strip-components=1 --xz -xf - - ln -s /sde/sde /usr/bin + .github/workflows/install-extras.sh - name: build run: | mkdir build cd build cmake .. cmake --build . -j$(nproc) - - run: ctest --test-dir build -j$(nproc) + - run: cd build; ctest --output-on-failure -j$(nproc) - name: archive test results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: test-results-gcc @@ -100,6 +68,7 @@ jobs: distro: - alpine - archlinux + - debian:11 # GCC 10 and CMake 3.18 - the minimum supported by mold - fedora - gentoo/stage3 - opensuse/tumbleweed @@ -115,12 +84,20 @@ jobs: cd build cmake .. cmake --build . -j$(nproc) - - run: ctest --test-dir build -j$(nproc) + - run: cd build; ctest --output-on-failure -j$(nproc) + - name: archive test results + uses: actions/upload-artifact@v4 + if: failure() + with: + name: test-results-${{ matrix.distro }} + path: | + build + !build/CMakeFiles build-macos: - runs-on: macos-12 + runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: build run: | mkdir build @@ -131,7 +108,7 @@ jobs: build-windows: runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: build run: | mkdir build @@ -142,7 +119,7 @@ jobs: build-msys: runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup MSYS2 uses: msys2/setup-msys2@v2 with: @@ -160,7 +137,7 @@ jobs: build-freebsd: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Build and test uses: vmactions/freebsd-vm@v1 with: @@ -171,4 +148,4 @@ jobs: cd build cmake .. cmake --build . 
-j$(nproc) - ctest -j$(nproc) + ctest --output-on-failure -j$(nproc) diff --git a/.github/workflows/install-extras.sh b/.github/workflows/install-extras.sh new file mode 100755 index 0000000000..863a3e432f --- /dev/null +++ b/.github/workflows/install-extras.sh @@ -0,0 +1,61 @@ +#!/bin/bash -x + +apt-get update +apt-get install -y wget xz-utils + +# Install a 32-bit RISC-V toolchain +mkdir /rv32 +wget -O- -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.07.07/riscv32-glibc-ubuntu-20.04-gcc-nightly-2023.07.07-nightly.tar.gz | tar -C /rv32 --strip-components=1 -xzf - + +ln -sf /rv32/sysroot /usr/riscv32-linux-gnu +echo '/rv32/bin/riscv32-unknown-linux-gnu-gcc -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-gcc +echo '/rv32/bin/riscv32-unknown-linux-gnu-g++ -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-g++ +chmod 755 /usr/bin/riscv32-linux-gnu-{gcc,g++} + +for i in objdump objcopy strip; do + ln -sf /rv32/bin/riscv32-unknown-linux-gnu-$i /usr/bin/riscv32-linux-gnu-$i +done + +# Install a LoongArch toolchain +mkdir /larch +wget -O- -q https://github.com/loongson/build-tools/releases/download/2024.11.01/x86_64-cross-tools-loongarch64-binutils_2.43.1-gcc_14.2.0-glibc_2.40.tar.xz | tar -C /larch --strip-components=1 --xz -xf - + +cp -r /larch/loongarch64-unknown-linux-gnu/lib/* /larch/target/lib64 +ln -sf /larch/target /usr/loongarch64-linux-gnu + +for i in gcc g++ objdump objcopy strip; do + ln -sf /larch/bin/loongarch64-unknown-linux-gnu-$i /usr/bin/loongarch64-linux-gnu-$i +done + +wget -O /usr/local/bin/qemu-loongarch64 -q https://github.com/loongson/build-tools/releases/download/2024.11.01/qemu-loongarch64 +chmod 755 /usr/local/bin/qemu-loongarch64 + +# Install ARM64 big-endian toolchain +mkdir /aarch64be +wget -O- -q https://sources.buildroot.net/toolchain-external-arm-aarch64-be/arm-gnu-toolchain-13.3.rel1-x86_64-aarch64_be-none-linux-gnu.tar.xz | tar -C /aarch64be --strip-components=1 --xz -xf - + +ln -sf 
/aarch64be/aarch64_be-none-linux-gnu/libc /usr/aarch64_be-linux-gnu +echo '/aarch64be/bin/aarch64_be-none-linux-gnu-gcc -L/usr/aarch64_be-linux-gnu "$@"' > /usr/bin/aarch64_be-linux-gnu-gcc +echo '/aarch64be/bin/aarch64_be-none-linux-gnu-g++ -L/usr/aarch64_be-linux-gnu "$@"' > /usr/bin/aarch64_be-linux-gnu-g++ +chmod 755 /usr/bin/aarch64_be-linux-gnu-{gcc,g++} + +for i in objdump objcopy strip; do + ln -sf /aarch64be/bin/aarch64_be-none-linux-gnu-$i /usr/bin/aarch64_be-linux-gnu-$i +done + +# Install SH4 big-endian toolchain +mkdir /sh4aeb +wget -O- -q https://toolchains.bootlin.com/downloads/releases/toolchains/sh-sh4aeb/tarballs/sh-sh4aeb--glibc--stable-2024.05-1.tar.xz | tar -C /sh4aeb --strip-components=1 --xz -xf - +ln -sf /sh4aeb/sh4aeb-buildroot-linux-gnu/sysroot /usr/sh4aeb-linux-gnu +echo '/sh4aeb/bin/sh4aeb-linux-gcc -L/usr/sh4aeb-linux-gnu "$@"' > /usr/bin/sh4aeb-linux-gnu-gcc +echo '/sh4aeb/bin/sh4aeb-linux-g++ -L/usr/sh4aeb-linux-gnu "$@"' > /usr/bin/sh4aeb-linux-gnu-g++ +chmod 755 /usr/bin/sh4aeb-linux-gnu-{gcc,g++} + +for i in objdump objcopy strip; do + ln -sf /sh4aeb/bin/sh4aeb-linux-$i /usr/bin/sh4aeb-linux-gnu-$i +done + +# Install Intel SDE CPU emulator for CET-related tests +mkdir /sde +wget -O- -q https://downloadmirror.intel.com/831748/sde-external-9.44.0-2024-08-22-lin.tar.xz | tar -C /sde --strip-components=1 --xz -xf - +ln -s /sde/sde64 /usr/bin diff --git a/.github/workflows/update-manpage.yml b/.github/workflows/update-manpage.yml index 1107e7741a..9d868b5f02 100644 --- a/.github/workflows/update-manpage.yml +++ b/.github/workflows/update-manpage.yml @@ -14,7 +14,7 @@ jobs: steps: - name: Check out repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install ronn run: sudo apt-get update && sudo apt-get install -y ronn diff --git a/.gitignore b/.gitignore index cf1f526619..29390501b4 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,6 @@ gentoo /mold-*.tar.gz /build* /mold +/dist /deploy !/build.sh diff --git 
a/CMakeLists.txt b/CMakeLists.txt index e1bc009043..d127546ccb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,7 +44,7 @@ # features and behave exactly the same. cmake_minimum_required(VERSION 3.14) -project(mold VERSION 2.34.1) +project(mold VERSION 2.36.0) include(CMakeDependentOption) include(CheckSymbolExists) @@ -180,7 +180,7 @@ endif() # be stable on 32-bit targets. cmake_dependent_option( MOLD_USE_MIMALLOC "Use mimalloc" ON - "CMAKE_SIZEOF_VOID_P EQUAL 8; NOT APPLE; NOT ANDROID; NOT OPENBSD" OFF) + "CMAKE_SIZEOF_VOID_P EQUAL 8; NOT APPLE; NOT ANDROID; NOT OPENBSD; NOT MOLD_USE_ASAN; NOT MOLD_USE_TSAN" OFF) cmake_dependent_option( MOLD_USE_SYSTEM_MIMALLOC "Use system or vendored mimalloc" OFF @@ -193,7 +193,6 @@ if(MOLD_USE_MIMALLOC) if(MOLD_USE_SYSTEM_MIMALLOC) find_package(mimalloc REQUIRED) target_link_libraries(mold PRIVATE mimalloc) - target_sources(mold PRIVATE lib/mimalloc.cc) else() function(mold_add_mimalloc) set(MI_BUILD_STATIC ON CACHE INTERNAL "") @@ -284,6 +283,7 @@ if(NOT APPLE AND NOT MSVC) endif() check_symbol_exists(madvise sys/mman.h HAVE_MADVISE) +check_symbol_exists(uname sys/utsname.h HAVE_UNAME) # Create a .cc file containing the current git hash for `mold --version`. add_custom_target(git_hash @@ -316,12 +316,14 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) # compiler instances. This is hacky but greatly reduces compile time # on a multicore machine. 
list(APPEND MOLD_ELF_TARGETS - X86_64 I386 ARM64 ARM32 RV32LE RV32BE RV64LE RV64BE PPC32 PPC64V1 PPC64V2 - S390X SPARC64 M68K SH4 LOONGARCH32 LOONGARCH64) + X86_64 I386 ARM64LE ARM64BE ARM32 RV32LE RV32BE RV64LE RV64BE PPC32 + PPC64V1 PPC64V2 S390X SPARC64 M68K SH4LE SH4BE LOONGARCH32 LOONGARCH64) list(APPEND MOLD_ELF_TEMPLATE_FILES + src/arch-arm64.cc src/arch-loongarch.cc src/arch-riscv.cc + src/arch-sh4.cc src/cmdline.cc src/filetype.cc src/gc-sections.cc @@ -390,19 +392,18 @@ target_sources(mold PRIVATE lib/filepath.cc lib/glob.cc lib/hyperloglog.cc + lib/mimalloc.cc lib/multi-glob.cc lib/perf.cc lib/random.cc lib/tar.cc src/arch-arm32.cc - src/arch-arm64.cc src/arch-i386.cc src/arch-m68k.cc src/arch-ppc32.cc src/arch-ppc64v1.cc src/arch-ppc64v2.cc src/arch-s390x.cc - src/arch-sh4.cc src/arch-sparc64.cc src/arch-x86-64.cc src/config.cc @@ -446,7 +447,6 @@ if(NOT CMAKE_SKIP_INSTALL_RULES) install(TARGETS mold RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) install(FILES docs/mold.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1/) install(FILES LICENSE DESTINATION ${CMAKE_INSTALL_DOCDIR}) - install(FILES "LICENSE.third-party" DESTINATION "${CMAKE_INSTALL_DOCDIR}") function(mold_install_relative_symlink OLD NEW) install(CODE " diff --git a/README.md b/README.md index c6ddf37bd0..7038ffef8e 100644 --- a/README.md +++ b/README.md @@ -76,12 +76,12 @@ installation location by passing `-DCMAKE_INSTALL_PREFIX=`. For other cmake options, see the comments in `CMakeLists.txt`. If you are not using a recent enough Linux distribution, or if `cmake` does -not work for you for any reason, you can use Docker to build mold in a Docker -environment. To do so, run `./dist.sh` in this directory instead of using -`cmake`. The shell script will pull a Docker image, build mold and auxiliary +not work for you for any reason, you can use Podman to build mold in a +container. To do so, run `./dist.sh` in this directory instead of using +`cmake`. 
The shell script will pull a container image, build mold and auxiliary files inside it, and package them into a single tar file named -`mold-$version-$arch-linux.tar.gz`. You can extract the tar file anywhere and -use the mold executable within it. +`dist/mold-$version-$arch-linux.tar.gz`. You can extract the tar file anywhere +and use the mold executable in it. ## How to use @@ -264,3 +264,4 @@ the following people and organizations who have sponsored $128/month or more: - [Bryant Biggs](https://github.com/bryantbiggs) - [kraptor23](https://github.com/kraptor23) - [Jinkyu Yi](https://github.com/jincreator) +- [Pedro Navarro](https://github.com/pedronavf) diff --git a/dist.sh b/dist.sh index af23aef270..464ebce6c5 100755 --- a/dist.sh +++ b/dist.sh @@ -1,7 +1,7 @@ #!/bin/bash # # This script creates a mold binary distribution. The output is written in -# this directory as `mold-$version-$arch-linux.tar.gz` (e.g., +# `dist` directory as `mold-$version-$arch-linux.tar.gz` (e.g., # `mold-1.0.3-x86_64-linux.tar.gz`). # # The mold executable created by this script is statically linked to @@ -18,7 +18,7 @@ # commit with release tags by rebuilding the binaries yourself. # # Debian provides snapshot.debian.org to host all historical binary -# packages. We use it to construct Docker images pinned to a +# packages. We use it to construct Podman images pinned to a # particular timestamp. # # We aim to use a reasonably old Debian version because we'll dynamically @@ -33,15 +33,14 @@ # We need a recent version of Clang to build mold. If it's not available # via apt-get, we'll build it ourselves. # -# You may need to run the following command to use Docker with Qemu: -# -# $ docker run --rm --privileged multiarch/qemu-user-static --reset -p yes +# You may need to install qemu-user-static package to build non-native +# binaries. 
set -e -x cd "$(dirname $0)" usage() { - echo "Usage: $0 [ x86_64 | aarch64 | arm | riscv64 | ppc64le | s390x ]" + echo "Usage: $0 [ x86_64 | aarch64 | arm | riscv64 | ppc64le | s390x | loongarch64 ]" exit 1 } @@ -61,52 +60,46 @@ case $# in usage esac -echo "$arch" | grep -Eq '^(x86_64|aarch64|arm|riscv64|ppc64le|s390x)$' || usage - -version=$(sed -n 's/^project(mold VERSION \(.*\))/\1/p' CMakeLists.txt) -dest=mold-$version-$arch-linux - +# Create a Podman image. if [ "$GITHUB_REPOSITORY" = '' ]; then image=mold-builder-$arch - docker_build="docker build --platform linux/$arch -t $image -" + image_build="podman build --arch $arch -t $image -" else # If this script is running on GitHub Actions, we want to cache - # the created Docker image in GitHub's Docker repostiory. + # the created container image in GitHub's container repository. image=ghcr.io/$GITHUB_REPOSITORY/mold-builder-$arch - docker_build="docker buildx build --platform linux/$arch -t $image --push --cache-to type=inline --cache-from type=registry,ref=ghcr.io/$GITHUB_REPOSITORY/mold-builder-$arch -" + image_build="podman build --arch $arch -t $image --output=type=registry --layers --cache-to $image --cache-from $image -" fi -# Create a Docker image. 
case $arch in x86_64) - # Debian 8 (Jessie) released in April 2015 - cat < /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ - apt-get install -y --no-install-recommends wget bzip2 file make autoconf gcc g++ libssl-dev && \ + apt-get install -y --no-install-recommends wget file make gcc g++ zlib1g-dev libssl-dev && \ rm -rf /var/lib/apt/lists # Build CMake 3.27 RUN mkdir /build && \ cd /build && \ - wget -O- --no-check-certificate https://cmake.org/files/v3.27/cmake-3.27.7.tar.gz | tar xzf - --strip-components=1 && \ + wget -O- --no-check-certificate --progress=dot:mega https://cmake.org/files/v3.27/cmake-3.27.7.tar.gz | tar xzf - --strip-components=1 && \ ./bootstrap --parallel=\$(nproc) && \ make -j\$(nproc) && \ make install && \ rm -rf /build -# Build GCC 10 +# Build GCC 14 RUN mkdir /build && \ cd /build && \ - wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/gcc/gcc-10.5.0/gcc-10.5.0.tar.gz | tar xzf - --strip-components=1 && \ - mkdir isl gmp mpc mpfr && \ - wget -O- --no-check-certificate https://gcc.gnu.org/pub/gcc/infrastructure/isl-0.18.tar.bz2 | tar xjf - --strip-components=1 -C isl && \ - wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/gmp/gmp-6.1.2.tar.bz2 | tar xjf - --strip-components=1 -C gmp && \ - wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/mpc/mpc-1.2.1.tar.gz | tar xzf - --strip-components=1 -C mpc && \ - wget -O- --no-check-certificate https://ftpmirror.gnu.org/gnu/mpfr/mpfr-4.1.0.tar.gz | tar xzf - --strip-components=1 -C mpfr && \ + wget -O- --no-check-certificate --progress=dot:mega https://ftpmirror.gnu.org/gcc/gcc-14.2.0/gcc-14.2.0.tar.gz | tar xzf - --strip-components=1 && \ + mkdir gmp mpc mpfr && \ + wget -O- --no-check-certificate --progress=dot:mega https://ftpmirror.gnu.org/gmp/gmp-6.3.0.tar.gz | tar xzf - --strip-components=1 -C gmp && \ + wget -O- --no-check-certificate --progress=dot:mega https://ftpmirror.gnu.org/mpc/mpc-1.3.1.tar.gz | tar xzf - --strip-components=1 -C mpc && \ 
+ wget -O- --no-check-certificate --progress=dot:mega https://ftpmirror.gnu.org/mpfr/mpfr-4.2.1.tar.gz | tar xzf - --strip-components=1 -C mpfr && \ ./configure --prefix=/usr --enable-languages=c,c++ --disable-bootstrap --disable-multilib && \ make -j\$(nproc) && \ make install && \ @@ -116,7 +109,7 @@ RUN mkdir /build && \ # Build GNU binutils 2.43 RUN mkdir /build && \ cd /build && \ - wget -O- --no-check-certificate https://ftp.gnu.org/gnu/binutils/binutils-2.43.tar.gz | tar xzf - --strip-components=1 && \ + wget -O- --no-check-certificate --progress=dot:mega https://ftpmirror.gnu.org/binutils/binutils-2.43.tar.gz | tar xzf - --strip-components=1 && \ ./configure --prefix=/usr && \ make -j\$(nproc) && \ make install && \ @@ -125,17 +118,16 @@ RUN mkdir /build && \ # Build Python 3.12.7 RUN mkdir /build && \ cd /build && \ - wget -O- --no-check-certificate https://www.python.org/ftp/python/3.12.7/Python-3.12.7.tgz | tar xzf - --strip-components=1 && \ + wget -O- --no-check-certificate --progress=dot:mega https://www.python.org/ftp/python/3.12.7/Python-3.12.7.tgz | tar xzf - --strip-components=1 && \ ./configure && \ make -j\$(nproc) && \ make install && \ - ln -sf /usr/local/bin/python3 /usr/local/bin/python && \ rm -rf /build -# Build LLVM 18.1.8 +# Build LLVM 19 RUN mkdir /build && \ cd /build && \ - wget -O- --no-check-certificate https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-18.1.8.tar.gz | tar xzf - --strip-components=1 && \ + wget -O- --no-check-certificate --progress=dot:mega https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-19.1.7.tar.gz | tar xzf - --strip-components=1 && \ mkdir b && \ cd b && \ cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=clang ../llvm && \ @@ -145,13 +137,13 @@ RUN mkdir /build && \ EOF ;; aarch64 | arm | ppc64le | s390x) - # Debian 10 (Bullseye) released in July 2019 + # Debian 11 (Bullseye) released in August 2021 # # We don't want to build Clang for these targets with Qemu becuase # that'd 
take extremely long time. Also I believe old build machines # are usually x86-64. - cat < /etc/apt/apt.conf.d/80-retries && \ @@ -163,8 +155,8 @@ RUN sed -i -e '/^deb/d' -e 's/^# deb /deb [trusted=yes] /g' /etc/apt/sources.lis EOF ;; riscv64) - cat < /etc/apt/apt.conf.d/80-retries && \ @@ -175,8 +167,26 @@ RUN sed -i -e '/^URIs/d' -e 's/^# http/URIs: http/' /etc/apt/sources.list.d/debi rm -rf /var/lib/apt/lists EOF ;; +loongarch64) + # LoongArch build is not reproducible yet + cat < /mold/$dest.tar.gz -cp mold /mold -chown $(id -u):$(id -g) /mold/$dest.tar.gz /mold/mold -sha256sum /mold/$dest.tar.gz +find $dest -print | sort | tar -cf - --no-recursion --files-from=- | gzip -9nc > /dist/$dest.tar.gz +cp mold /dist +sha256sum /dist/$dest.tar.gz " diff --git a/docs/mold.1 b/docs/mold.1 index f0d9d1f731..9689b83a2c 100644 --- a/docs/mold.1 +++ b/docs/mold.1 @@ -1,6 +1,6 @@ .\" generated with Ronn-NG/v0.9.1 .\" http://github.com/apjanke/ronn-ng/tree/0.9.1 -.TH "MOLD" "1" "August 2024" "" +.TH "MOLD" "1" "February 2025" "" .SH "NAME" \fBmold\fR \- a modern linker .SH "SYNOPSIS" @@ -10,9 +10,9 @@ .SS "How to use" See \fIhttps://github\.com/rui314/mold#how\-to\-use\fR\. .SS "Compatibility" -\fBMold\fR is designed to be a drop\-in replacement for the GNU linkers for linking user\-land programs\. If your user\-land program cannot be built due to missing command\-line options, please file a bug at \fIhttps://github\.com/rui314/mold/issues\fR\. +\fBmold\fR is designed to be a drop\-in replacement for the GNU linkers for linking user\-land programs\. If your user\-land program cannot be built due to missing command\-line options, please file a bug at \fIhttps://github\.com/rui314/mold/issues\fR\. 
.P -Mold supports a very limited set of linker script features, which is just sufficient to read \fB/usr/lib/x86_64\-linux\-gnu/libc\.so\fR on Linux systems (on Linux, that file is contrary to its name not a shared library but an ASCII linker script that loads a real \fBlibc\.so\fR file\.) +\fBmold\fR supports a very limited set of linker script features, which is just sufficient to read \fB/usr/lib/x86_64\-linux\-gnu/libc\.so\fR on Linux systems (on Linux, that file is contrary to its name not a shared library but an ASCII linker script that loads a real \fBlibc\.so\fR file\.) .P Beyond that, we have no plan to support any additional linker script features\. The linker script is an ad\-hoc, over\-designed, complex language which we believe needs to be replaced by a simpler mechanism\. We have a plan to add a replacement for the linker script to \fBmold\fR instead\. .SS "Archive symbol resolution" @@ -40,13 +40,11 @@ Typically, an ELF module that exports a symbol also imports the same symbol\. Su .P Let's take \fBmalloc\fR as an example\. Assume that you define your version of \fBmalloc\fR in your main executable file\. Then, all \fBmalloc\fR calls from any module are resolved to your function instead of the one in libc, because the executable is always at the beginning of the dynamic symbol search list\. Note that even \fBmalloc\fR calls within libc are resolved to your definition since libc exports and imports \fBmalloc\fR\. Therefore, by defining \fBmalloc\fR yourself, you can overwrite a library function, and the \fBmalloc\fR in libc becomes dead code\. .P -These Unix semantics are tricky and sometimes considered harmful\. For example, assume that you accidentally define \fBatoi\fR as a global function in your executable that behaves completely differently from the one in the C standard\. 
Then, all \fBatoi\fR function calls from any modules (even function calls within libc) are redirected to your function instead of the one in libc, which will very likely cause a problem\. -.P -That is a somewhat surprising consequence for an accidental name conflict\. On the other hand, this semantic is sometimes useful because it allows users to override library functions without rebuilding modules containing them\. +These Unix semantics are tricky and sometimes considered harmful\. For example, assume that you accidentally define \fBatoi\fR as a global function in your executable that behaves completely differently from the one in the C standard\. Then, all \fBatoi\fR function calls from any modules (even function calls within libc) are redirected to your function instead of the one in libc, which will very likely cause a problem\. That is a somewhat surprising consequence for an accidental name conflict\. On the other hand, this semantic is sometimes useful because it allows users to override library functions without rebuilding modules containing them\. .P Whether good or bad, you should keep these semantics in mind to understand Unix linkers' behaviors\. .SS "Build reproducibility" -\fBmold\fR's output is deterministic\. That is, if you pass the same object files and the same command\-line options to the same version of \fBmold\fR, it is guaranteed that \fBmold\fR produces the bit\-by\-bit identical output\. The linker's internal randomness, such as the timing of thread scheduling or iteration orders of hash tables, doesn't affect the output\. +\fBmold\fR's output is deterministic\. That is, if you pass the same object files and the same command\-line options to the same version of \fBmold\fR, it is guaranteed that \fBmold\fR produces the bit\-for\-bit identical output\. The linker's internal randomness, such as the timing of thread scheduling or iteration orders of hash tables, doesn't affect the output\. 
.P \fBmold\fR does not have any host\-specific default settings\. This is contrary to the GNU linkers, for which some configurable values, such as system\-dependent library search paths, are hard\-coded\. \fBmold\fR depends only on its command\-line arguments\. .SH "OPTION NOTATIONS" @@ -67,7 +65,7 @@ Synonym for \fB\-\-color\-diagnostics=auto\fR\. \fB\-\-no\-color\-diagnostics\fR Synonym for \fB\-\-color\-diagnostics=never\fR\. .TP -\fB\-\-detach\fR, `\-\-no\-detach +\fB\-\-detach\fR, \fB\-\-no\-detach\fR Permit or do not permit mold to create a debug info file in the background\. .TP \fB\-\-fork\fR, \fB\-\-no\-fork\fR @@ -355,6 +353,11 @@ Report undefined symbols (even with \fB\-\-shared\fR)\. \fB\-\-noinhibit\-exec\fR Create an output file even if errors occur\. .TP +\fB\-\-package\-metadata\fR=\fIpercent\-encoded\-string\fR +Embed a specified string into the \fB\.note\.package\fR section\. This option is designed for build scripts that generate binary packages, such as \fB\.rpm\fR or \fB\.deb\fR, to include package metadata in each executable\. It simplifies the process of identifying the corresponding package for a given executable or core file\. +.IP +An argument to this option is treated as percent\-encoded and decoded before being inserted into the section, allowing you to avoid the use of the comma (\fB,\fR) character in the argument\. This is useful because the compiler replaces all occurrences of commas in \fB\-Wl,\fR with spaces before forwarding them to the linker\. Note that \fBmold\fR always interprets the argument as percent\-encoded, so you also need to escape all occurrences of \fB%\fR as \fB%25\fR\. +.TP \fB\-\-pack\-dyn\-relocs\fR=[ \fBrelr\fR | \fBnone\fR ] If \fBrelr\fR is specified, all \fBR_*_RELATIVE\fR relocations are put into \fB\.relr\.dyn\fR section instead of \fB\.rel\.dyn\fR or \fB\.rela\.dyn\fR section\. Since \fB\.relr\.dyn\fR section uses a space\-efficient encoding scheme, specifying this flag can reduce the size of the output\. 
This is typically most effective for position\-independent executable\. .IP diff --git a/docs/mold.md b/docs/mold.md index 19e7c25b3c..91915080cb 100644 --- a/docs/mold.md +++ b/docs/mold.md @@ -13,12 +13,12 @@ See . ### Compatibility -**Mold** is designed to be a drop-in replacement for the GNU linkers for +`mold` is designed to be a drop-in replacement for the GNU linkers for linking user-land programs. If your user-land program cannot be built due to missing command-line options, please file a bug at . -Mold supports a very limited set of linker script features, which is just +`mold` supports a very limited set of linker script features, which is just sufficient to read `/usr/lib/x86_64-linux-gnu/libc.so` on Linux systems (on Linux, that file is contrary to its name not a shared library but an ASCII linker script that loads a real `libc.so` file.) @@ -104,11 +104,10 @@ assume that you accidentally define `atoi` as a global function in your executable that behaves completely differently from the one in the C standard. Then, all `atoi` function calls from any modules (even function calls within libc) are redirected to your function instead of the one in libc, which will -very likely cause a problem. - -That is a somewhat surprising consequence for an accidental name conflict. On -the other hand, this semantic is sometimes useful because it allows users to -override library functions without rebuilding modules containing them. +very likely cause a problem. That is a somewhat surprising consequence for an +accidental name conflict. On the other hand, this semantic is sometimes useful +because it allows users to override library functions without rebuilding +modules containing them. Whether good or bad, you should keep these semantics in mind to understand Unix linkers' behaviors. @@ -117,7 +116,7 @@ Unix linkers' behaviors. `mold`'s output is deterministic. 
That is, if you pass the same object files and the same command-line options to the same version of `mold`, it is -guaranteed that `mold` produces the bit-by-bit identical output. The linker's +guaranteed that `mold` produces the bit-for-bit identical output. The linker's internal randomness, such as the timing of thread scheduling or iteration orders of hash tables, doesn't affect the output. @@ -152,7 +151,7 @@ but as `-o magic`. * `--no-color-diagnostics`: Synonym for `--color-diagnostics=never`. -* `--detach`, `--no-detach: +* `--detach`, `--no-detach`: Permit or do not permit mold to create a debug info file in the background. * `--fork`, `--no-fork`: @@ -618,6 +617,21 @@ but as `-o magic`. * `--noinhibit-exec`: Create an output file even if errors occur. +* `--package-metadata`=_percent-encoded-string_: + Embed a specified string into the `.note.package` section. This option + is designed for build scripts that generate binary packages, such as + `.rpm` or `.deb`, to include package metadata in each executable. It + simplifies the process of identifying the corresponding package for a + given executable or core file. + + An argument to this option is treated as percent-encoded and decoded + before being inserted into the section, allowing you to avoid the use of + the comma (`,`) character in the argument. This is useful because the + compiler replaces all occurrences of commas in `-Wl,` with spaces before + forwarding them to the linker. Note that `mold` always interprets the + argument as percent-encoded, so you also need to escape all occurrences + of `%` as `%25`. + * `--pack-dyn-relocs`=[ `relr` | `none` ]: If `relr` is specified, all `R_*_RELATIVE` relocations are put into `.relr.dyn` section instead of `.rel.dyn` or `.rela.dyn` section. 
Since diff --git a/install-build-deps.sh b/install-build-deps.sh index 60e835a298..cf746b9022 100755 --- a/install-build-deps.sh +++ b/install-build-deps.sh @@ -14,7 +14,10 @@ ubuntu | pop | linuxmint | debian | raspbian | neon) if [ "$ID-$VERSION_ID" = ubuntu-20.04 ]; then apt-get install -y g++-10; fi ;; fedora | amzn | rhel) - dnf install -y gcc-g++ cmake glibc-static libstdc++-static diffutils util-linux + dnf install -y gcc-g++ cmake glibc-static libstdc++-static diffutils util-linux tar + ;; +rocky) + dnf install -y gcc-g++ cmake diffutils util-linux ;; opensuse-*) zypper install -y make cmake gcc-c++ glibc-devel-static tar diffutils util-linux diff --git a/lib/common.h b/lib/common.h index 61edb8d7b1..dc1bda2073 100644 --- a/lib/common.h +++ b/lib/common.h @@ -1,5 +1,6 @@ #pragma once +#include "config.h" #include "integers.h" #include @@ -275,6 +276,12 @@ struct Atomic : std::atomic { } }; +// +// mimalloc.cc +// + +void set_mimalloc_options(); + // // perf.cc // @@ -339,7 +346,7 @@ class Timer { public: Timer(Context &ctx, std::string name, Timer *parent = nullptr) { record = new TimerRecord(name, parent ? parent->record : nullptr); - ctx.timer_records.push_back(std::unique_ptr(record)); + ctx.timer_records.emplace_back(record); } Timer(const Timer &) = delete; @@ -356,23 +363,6 @@ class Timer { TimerRecord *record; }; -// -// Bit vector -// - -class BitVector { -public: - BitVector() = default; - BitVector(u32 size) : vec((size + 7) / 8) {} - - void resize(u32 size) { vec.resize((size + 7) / 8); } - bool get(u32 idx) const { return vec[idx / 8] & (1 << (idx % 8)); } - void set(u32 idx) { vec[idx / 8] |= 1 << (idx % 8); } - -private: - std::vector vec; -}; - // // Utility functions // @@ -410,8 +400,14 @@ inline u64 bits(u64 val, u64 hi, u64 lo) { return (val >> lo) & ((1LL << (hi - lo + 1)) - 1); } -inline i64 sign_extend(u64 val, i64 size) { - return (i64)(val << (63 - size)) >> (63 - size); +// Cast val to a signed N bit integer. 
+// For example, sign_extend(x, 32) == (i32)x for any integer x. +inline i64 sign_extend(u64 val, i64 n) { + return (i64)(val << (64 - n)) >> (64 - n); +} + +inline bool is_int(u64 val, i64 n) { + return sign_extend(val, n) == val; } template > @@ -616,14 +612,14 @@ class ConcurrentMap { #endif } - std::pair insert(std::string_view key, u32 hash, const T &val) { + std::pair insert(std::string_view key, u64 hash, const T &val) { assert(has_single_bit(nbuckets)); - i64 begin = hash & (nbuckets - 1); + u64 begin = hash & (nbuckets - 1); u64 mask = nbuckets / NUM_SHARDS - 1; for (i64 i = 0; i < MAX_RETRY; i++) { - i64 idx = (begin & ~mask) | ((begin + i) & mask); + u64 idx = (begin & ~mask) | ((begin + i) & mask); Entry &ent = entries[idx]; // It seems avoiding compare-and-swap is faster overall at least @@ -661,8 +657,8 @@ class ConcurrentMap { return {&ent.value, false}; } - assert(false && "ConcurrentMap is full"); - return {nullptr, false}; + std::cerr << "ConcurrentMap is full\n"; + abort(); } i64 get_idx(T *value) const { @@ -720,12 +716,12 @@ class ConcurrentMap { return flatten(vec); } - static constexpr i64 MIN_NBUCKETS = 2048; + static constexpr i64 MIN_NBUCKETS = 4096; static constexpr i64 NUM_SHARDS = 16; - static constexpr i64 MAX_RETRY = 128; + static constexpr i64 MAX_RETRY = 256; Entry *entries = nullptr; - i64 nbuckets = 0; + u64 nbuckets = 0; }; // @@ -929,7 +925,7 @@ class MappedFile { mf->size = size; mf->parent = this; - ctx.mf_pool.push_back(std::unique_ptr(mf)); + ctx.mf_pool.emplace_back(mf); return mf; } @@ -993,7 +989,7 @@ MappedFile *open_file(Context &ctx, std::string path) { // Use `unique_ptr` to manage memory resources. 
if (mf) - ctx.mf_pool.push_back(std::unique_ptr(mf)); + ctx.mf_pool.emplace_back(mf); return mf; } diff --git a/lib/config.h.in b/lib/config.h.in index be3eec8db0..492972cf69 100644 --- a/lib/config.h.in +++ b/lib/config.h.in @@ -2,3 +2,6 @@ #define MOLD_LIBDIR "@CMAKE_INSTALL_FULL_LIBDIR@" #cmakedefine HAVE_MADVISE 1 +#cmakedefine HAVE_UNAME 1 +#cmakedefine MOLD_USE_MIMALLOC 1 +#cmakedefine MOLD_USE_SYSTEM_MIMALLOC 1 diff --git a/lib/gentoo-test.sh b/lib/gentoo-test.sh index dbdae00686..0881a3d916 100755 --- a/lib/gentoo-test.sh +++ b/lib/gentoo-test.sh @@ -1,14 +1,14 @@ #!/bin/bash # # This test script takes a Gentoo package name and tries to build it -# using mold in a Docker environment. We chose Gentoo Linux as a test +# using mold in a Podman environment. We chose Gentoo Linux as a test # target, because its source-based package allows us to build programs # locally and run their test suites without any hassle. # # You can get a complete list of Gentoo packages availalbe for testing # with the following command: # -# docker run --rm mold-gentoo emerge --color n -s '' | \ +# podman run --rm mold-gentoo emerge --color n -s '' | \ # perl -ne 'next unless m!^\*\s+(\S+/\S+)!; print "$1\n"' package="$1" @@ -20,48 +20,48 @@ fi set -x -# Create a Docker image -if ! docker image ls mold-gentoo | grep -q mold-gentoo; then +# Create a Podman image +if ! 
podman image ls mold-gentoo | grep -q mold-gentoo; then set -e - cat <> /etc/portage/make.conf && \ echo 'ACCEPT_KEYWORDS="~amd64"' >> /etc/portage/make.conf && \ echo 'ACCEPT_LICENSE="* -@EULA"' >> /etc/portage/make.conf && \ echo 'FEATURES="\${FEATURE} noclean nostrip ccache -ipc-sandbox -network-sandbox -pid-sandbox -sandbox"' >> /etc/portage/make.conf && \ echo 'CCACHE_DIR="/ccache"' >> /etc/portage/make.conf && \ - emerge gdb lld clang vim emacs strace ccache xeyes dev-build/cmake dev-vcs/git && \ + emerge gdb lld llvm-core/clang vim emacs strace ccache xeyes dev-build/cmake dev-vcs/git && \ rm -rf /var/tmp/portage EOF set +e fi -git_hash=$(./mold --version | perl -ne '/\((\w+)/; print $1;') +git_hash=$(./dist/mold --version | perl -ne '/\((\w+)/; print $1;') if [ "$package" = dev-libs/concurrencykit ]; then echo "Skipping known broken package: $package" exit 0 fi -# Build a given package in Docker -cmd1='(cd /usr/bin; ln -sf /mold/mold $(realpath ld))' +# Build a given package in Podman +cmd1='(cd /usr/bin; ln -sf /mold/dist/mold $(realpath ld))' cmd2="MAKEOPTS=-'j$(nproc) --load-average=100' emerge --onlydeps $package" cmd3="MAKEOPTS='-j$(nproc) --load-average=100' FEATURES=test emerge $package" filename=`echo "$package" | sed 's!/!_!g'` -docker="docker run --rm --cap-add=SYS_PTRACE -v `pwd`:/mold -v /var/cache/ccache-gentoo:/ccache mold-gentoo timeout -v -k 15s 3h" +podman="podman run --rm --pids-limit=-1 --cap-add=SYS_PTRACE -v `pwd`:/mold:ro -v /var/cache/ccache-gentoo:/ccache mold-gentoo timeout -v -k 15s 3h" dir=gentoo/$git_hash mkdir -p "$dir"/success "$dir"/failure -$docker nice -n 19 bash -c "$cmd1 && $cmd2 && $cmd3" >& "$dir"/"$filename".mold +$podman nice -n 19 bash -c "$cmd1 && $cmd2 && $cmd3" >& "$dir"/"$filename".mold if [ $? 
= 0 ]; then mv "$dir"/"$filename".mold "$dir"/success else mv "$dir"/"$filename".mold "$dir"/failure fi -$docker nice -n 19 bash -c "$cmd2 && $cmd3" >& "$dir"/"$filename".ld +$podman nice -n 19 bash -c "$cmd2 && $cmd3" >& "$dir"/"$filename".ld if [ $? = 0 ]; then mv "$dir"/"$filename".ld "$dir"/success else diff --git a/lib/glob.cc b/lib/glob.cc index 25db567c33..942c68e9bc 100644 --- a/lib/glob.cc +++ b/lib/glob.cc @@ -19,7 +19,7 @@ std::optional Glob::compile(std::string_view pat) { // [$\]!]: $, ] or ! // [a-czg-i]: a, b, c, z, g, h, or i // [^a-z]: Any character except lowercase letters - vec.push_back({BRACKET}); + vec.emplace_back(BRACKET); std::bitset<256> &bitset = vec.back().bitset; bool negate = false; @@ -77,22 +77,22 @@ std::optional Glob::compile(std::string_view pat) { break; } case '?': - vec.push_back({QUESTION}); + vec.emplace_back(QUESTION); break; case '*': - vec.push_back({STAR}); + vec.emplace_back(STAR); break; case '\\': if (pat.empty()) return {}; if (vec.empty() || vec.back().kind != STRING) - vec.push_back({STRING}); + vec.emplace_back(STRING); vec.back().str += pat[0]; pat = pat.substr(1); break; default: if (vec.empty() || vec.back().kind != STRING) - vec.push_back({STRING}); + vec.emplace_back(STRING); vec.back().str += c; break; } diff --git a/lib/jobs-unix.cc b/lib/jobs-unix.cc index 9912ab52c4..ac453b015b 100644 --- a/lib/jobs-unix.cc +++ b/lib/jobs-unix.cc @@ -11,9 +11,9 @@ #include "common.h" +#include #include #include -#include #include #include #include diff --git a/lib/mimalloc.cc b/lib/mimalloc.cc index a97a028223..823b91b8e9 100644 --- a/lib/mimalloc.cc +++ b/lib/mimalloc.cc @@ -1 +1,24 @@ -#include +#include "common.h" + +// Including mimalloc-new-delete.h overrides new/delete operators. +// We need it only when we are using mimalloc as a dynamic library. +#if MOLD_USE_SYSTEM_MIMALLOC +# include +#endif + +// Silence mimalloc warning messages that users can just ignore. 
+#if MOLD_USE_MIMALLOC +# include + +namespace mold { +void set_mimalloc_options() { + mi_option_disable(mi_option_verbose); + mi_option_disable(mi_option_show_errors); +} +} + +#else +namespace mold { +void set_mimalloc_options() {} +} +#endif diff --git a/lib/multi-glob.cc b/lib/multi-glob.cc index 56b289e526..047acd4239 100644 --- a/lib/multi-glob.cc +++ b/lib/multi-glob.cc @@ -106,12 +106,12 @@ bool MultiGlob::add(std::string_view pat, i64 val) { assert(!is_compiled); assert(!pat.empty()); - strings.push_back(std::string(pat)); + strings.emplace_back(pat); // Complex glob pattern if (!is_simple_pattern(pat)) { if (std::optional glob = Glob::compile(pat)) { - globs.push_back({std::move(*glob), val}); + globs.emplace_back(std::move(*glob), val); return true; } return false; diff --git a/lib/perf.cc b/lib/perf.cc index cf053b0846..8504f37e2c 100644 --- a/lib/perf.cc +++ b/lib/perf.cc @@ -3,6 +3,7 @@ #include #include #include +#include #ifndef _WIN32 #include diff --git a/lib/signal-unix.cc b/lib/signal-unix.cc index 85cec3ee01..e429232e0f 100644 --- a/lib/signal-unix.cc +++ b/lib/signal-unix.cc @@ -1,6 +1,6 @@ #include "common.h" -#include +#include #include #ifdef __FreeBSD__ diff --git a/src/arch-arm32.cc b/src/arch-arm32.cc index 86c804d439..0ca209b0e7 100644 --- a/src/arch-arm32.cc +++ b/src/arch-arm32.cc @@ -34,8 +34,6 @@ #include "mold.h" #include -#include -#include namespace mold { @@ -43,6 +41,9 @@ using E = ARM32; template <> i64 get_addend(u8 *loc, const ElfRel &rel) { + ul32 *arm = (ul32 *)loc; + ul16 *thm = (ul16 *)loc; + switch (rel.r_type) { case R_ARM_ABS32: case R_ARM_REL32: @@ -57,76 +58,100 @@ i64 get_addend(u8 *loc, const ElfRel &rel) { case R_ARM_TLS_LE32: case R_ARM_TLS_GOTDESC: case R_ARM_TARGET2: - return *(il32 *)loc; + return (il32)*arm; case R_ARM_THM_JUMP11: - return sign_extend(*(ul16 *)loc, 10) << 1; + return sign_extend(thm[0], 11) << 1; + case R_ARM_THM_JUMP19: { + // 
https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/B--Branch- + u32 S = bit(thm[0], 10); + u32 J2 = bit(thm[1], 11); + u32 J1 = bit(thm[1], 13); + u32 imm6 = bits(thm[0], 5, 0); + u32 imm11 = bits(thm[1], 10, 0); + u32 val = (S << 20) | (J2 << 19) | (J1 << 18) | (imm6 << 12) | (imm11 << 1); + return sign_extend(val, 21); + } case R_ARM_THM_CALL: case R_ARM_THM_JUMP24: case R_ARM_THM_TLS_CALL: { - u32 S = bit(*(ul16 *)loc, 10); - u32 J1 = bit(*(ul16 *)(loc + 2), 13); - u32 J2 = bit(*(ul16 *)(loc + 2), 11); + // https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/BL--BLX--immediate---Branch-with-Link-and-optional-Exchange--immediate-- + u32 S = bit(thm[0], 10); + u32 J1 = bit(thm[1], 13); + u32 J2 = bit(thm[1], 11); u32 I1 = !(J1 ^ S); u32 I2 = !(J2 ^ S); - u32 imm10 = bits(*(ul16 *)loc, 9, 0); - u32 imm11 = bits(*(ul16 *)(loc + 2), 10, 0); + u32 imm10 = bits(thm[0], 9, 0); + u32 imm11 = bits(thm[1], 10, 0); u32 val = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1); - return sign_extend(val, 24); + return sign_extend(val, 25); } case R_ARM_CALL: case R_ARM_JUMP24: case R_ARM_PLT32: case R_ARM_TLS_CALL: - return sign_extend(*(ul32 *)loc, 23) << 2; + return sign_extend(*arm, 24) << 2; case R_ARM_MOVW_PREL_NC: case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_PREL: case R_ARM_MOVT_ABS: { - u32 imm12 = bits(*(ul32 *)loc, 11, 0); - u32 imm4 = bits(*(ul32 *)loc, 19, 16); - return sign_extend((imm4 << 12) | imm12, 15); + // https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/MOV--MOVS--immediate---Move--immediate-- + u32 imm4 = bits(*arm, 19, 16); + u32 imm12 = bits(*arm, 11, 0); + u32 val = (imm4 << 12) | imm12; + return sign_extend(val, 16); } case R_ARM_PREL31: - return sign_extend(*(ul32 *)loc, 30); + return sign_extend(*arm, 31); case R_ARM_THM_MOVW_PREL_NC: case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_PREL: case R_ARM_THM_MOVT_ABS: { - u32 imm4 = bits(*(ul16 *)loc, 3, 0); - u32 i = 
bit(*(ul16 *)loc, 10); - u32 imm3 = bits(*(ul16 *)(loc + 2), 14, 12); - u32 imm8 = bits(*(ul16 *)(loc + 2), 7, 0); + // https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/MOVT--Move-Top- + u32 imm4 = bits(thm[0], 3, 0); + u32 i = bit(thm[0], 10); + u32 imm3 = bits(thm[1], 14, 12); + u32 imm8 = bits(thm[1], 7, 0); u32 val = (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8; - return sign_extend(val, 15); + return sign_extend(val, 16); } default: return 0; } } -static void write_mov_imm(u8 *loc, u32 val) { +static void write_arm_mov(u8 *loc, u32 val) { u32 imm12 = bits(val, 11, 0); u32 imm4 = bits(val, 15, 12); - *(ul32 *)loc = (*(ul32 *)loc & 0xfff0f000) | (imm4 << 16) | imm12; + *(ul32 *)loc = (*(ul32 *)loc & 0xfff0'f000) | (imm4 << 16) | imm12; +} + +static void write_thm_b21(u8 *loc, u32 val) { + u32 S = bit(val, 20); + u32 J2 = bit(val, 19); + u32 J1 = bit(val, 18); + u32 imm6 = bits(val, 17, 12); + u32 imm11 = bits(val, 11, 1); + + ul16 *buf = (ul16 *)loc; + buf[0] = (buf[0] & 0b1111'1011'1100'0000) | (S << 10) | imm6; + buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11; } -static void write_thm_b_imm(u8 *loc, u32 val) { - // https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/BL--BLX--immediate- - u32 sign = bit(val, 24); +static void write_thm_b25(u8 *loc, u32 val) { + u32 S = bit(val, 24); u32 I1 = bit(val, 23); u32 I2 = bit(val, 22); - u32 J1 = !I1 ^ sign; - u32 J2 = !I2 ^ sign; + u32 J1 = !I1 ^ S; + u32 J2 = !I2 ^ S; u32 imm10 = bits(val, 21, 12); u32 imm11 = bits(val, 11, 1); ul16 *buf = (ul16 *)loc; - buf[0] = (buf[0] & 0b1111'1000'0000'0000) | (sign << 10) | imm10; + buf[0] = (buf[0] & 0b1111'1000'0000'0000) | (S << 10) | imm10; buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11; } -static void write_thm_mov_imm(u8 *loc, u32 val) { - // 
https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/MOVT +static void write_thm_mov(u8 *loc, u32 val) { u32 imm4 = bits(val, 15, 12); u32 i = bit(val, 11); u32 imm3 = bits(val, 10, 8); @@ -163,7 +188,7 @@ void write_addend(u8 *loc, i64 val, const ElfRel &rel) { case R_ARM_THM_CALL: case R_ARM_THM_JUMP24: case R_ARM_THM_TLS_CALL: - write_thm_b_imm(loc, val); + write_thm_b25(loc, val); break; case R_ARM_CALL: case R_ARM_JUMP24: @@ -174,7 +199,7 @@ void write_addend(u8 *loc, i64 val, const ElfRel &rel) { case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_PREL: case R_ARM_MOVT_ABS: - write_mov_imm(loc, val); + write_arm_mov(loc, val); break; case R_ARM_PREL31: *(ul32 *)loc = (*(ul32 *)loc & 0x8000'0000) | (val & 0x7fff'ffff); @@ -183,7 +208,7 @@ void write_addend(u8 *loc, i64 val, const ElfRel &rel) { case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_PREL: case R_ARM_THM_MOVT_ABS: - write_thm_mov_imm(loc, val); + write_thm_mov(loc, val); break; default: unreachable(); @@ -207,7 +232,7 @@ void write_plt_header(Context &ctx, u8 *buf) { *(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16; } -constexpr ul32 plt_entry[] = { +static constexpr ul32 plt_entry[] = { 0xe59f'c004, // 1: ldr ip, 2f 0xe08c'c00f, // add ip, ip, pc 0xe59c'f000, // ldr pc, [ip] @@ -245,25 +270,18 @@ void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, } } -// ARM and Thumb branch instructions can jump within ±16 MiB. 
-static bool is_jump_reachable(i64 val) { - return sign_extend(val, 24) == val; +static Thunk &get_reachable_thunk(OutputSection &osec, u64 addr) { + auto it = std::upper_bound(osec.thunks.begin(), osec.thunks.end(), addr, + [](u64 addr, std::unique_ptr> &thunk) { + return addr < thunk->get_addr(); + }); + return **it; } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable { - for (; i < output_section->thunks.size(); i++) { - i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset - - addr; - if (is_jump_reachable(disp)) - return disp; - } - unreachable(); - }; - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX) @@ -286,8 +304,12 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; - auto get_thumb_thunk_addr = [&] { return get_thunk_addr(i); }; - auto get_arm_thunk_addr = [&] { return get_thunk_addr(i) + 4; }; + auto get_thumb_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); }; + auto get_arm_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 4; }; + + auto get_tlsdesc_trampoline_addr = [&] { + return get_reachable_thunk(*output_section, P).get_addr(); + }; switch (rel.r_type) { case R_ARM_ABS32: @@ -303,21 +325,21 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; } - // THM_CALL relocation refers either BL or BLX instruction. + // THM_CALL relocation refers to either BL or BLX instruction. // They are different in only one bit. We need to use BL if // the jump target is Thumb. Otherwise, use BLX. 
- i64 val = S + A - P; - if (is_jump_reachable(val)) { - if (T) { - write_thm_b_imm(loc, val); - *(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL - } else { - write_thm_b_imm(loc, align_to(val, 4)); - *(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX - } + i64 val1 = S + A - P; + i64 val2 = align_to(S + A - P, 4); + + if (T && is_int(val1, 25)) { + *(ul16 *)(loc + 2) |= 0x1000; // BL + write_thm_b25(loc, val1); + } else if (!T && is_int(val2, 25)) { + *(ul16 *)(loc + 2) &= ~0x1000; // BLX + write_thm_b25(loc, val2); } else { - write_thm_b_imm(loc, align_to(get_arm_thunk_addr() + A - P, 4)); - *(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX + *(ul16 *)(loc + 2) |= 0x1000; // BL + write_thm_b25(loc, get_thumb_thunk_addr() + A - P); } break; } @@ -347,8 +369,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { if (!is_bl && !is_blx) Fatal(ctx) << *this << ": R_ARM_CALL refers to neither BL nor BLX"; - u64 val = S + A - P; - if (is_jump_reachable(val)) { + i64 val = S + A - P; + if (is_int(val, 26)) { if (T) { *(ul32 *)loc = 0xfa00'0000; // BLX *(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2); @@ -374,8 +396,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // immediate; it takes only a register. So if mode switch is // required, we jump to a linker-synthesized thunk which does the // job with a longer code sequence. 
- u64 val = S + A - P; - if (!is_jump_reachable(val) || T) + i64 val = S + A - P; + if (T || !is_int(val, 26)) val = get_arm_thunk_addr() + A - P; *(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2); break; @@ -389,29 +411,14 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { } break; case R_ARM_THM_JUMP11: - assert(T); check(S + A - P, -(1 << 11), 1 << 11); *(ul16 *)loc &= 0xf800; *(ul16 *)loc |= bits(S + A - P, 11, 1); break; - case R_ARM_THM_JUMP19: { - i64 val = S + A - P; - check(val, -(1 << 19), 1 << 19); - - // sign:J2:J1:imm6:imm11:'0' - u32 sign = bit(val, 20); - u32 J2 = bit(val, 19); - u32 J1 = bit(val, 18); - u32 imm6 = bits(val, 17, 12); - u32 imm11 = bits(val, 11, 1); - - *(ul16 *)loc &= 0b1111'1011'1100'0000; - *(ul16 *)loc |= (sign << 10) | imm6; - - *(ul16 *)(loc + 2) &= 0b1101'0000'0000'0000; - *(ul16 *)(loc + 2) |= (J2 << 13) | (J1 << 11) | imm11; + case R_ARM_THM_JUMP19: + check(S + A - P, -(1 << 20), 1 << 20); + write_thm_b21(loc, S + A - P); break; - } case R_ARM_THM_JUMP24: { if (sym.is_remaining_undef_weak()) { *(ul32 *)loc = 0x8000'f3af; // NOP @@ -420,20 +427,20 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // Just like R_ARM_JUMP24, we need to jump to a thunk if we need to // switch processor mode. 
- u64 val = S + A - P; - if (!is_jump_reachable(val) || !T) + i64 val = S + A - P; + if (!T || !is_int(val, 25)) val = get_thumb_thunk_addr() + A - P; - write_thm_b_imm(loc, val); + write_thm_b25(loc, val); break; } case R_ARM_MOVW_PREL_NC: - write_mov_imm(loc, ((S + A) | T) - P); + write_arm_mov(loc, ((S + A) | T) - P); break; case R_ARM_MOVW_ABS_NC: - write_mov_imm(loc, (S + A) | T); + write_arm_mov(loc, (S + A) | T); break; case R_ARM_THM_MOVW_PREL_NC: - write_thm_mov_imm(loc, ((S + A) | T) - P); + write_thm_mov(loc, ((S + A) | T) - P); break; case R_ARM_PREL31: check(S + A - P, -(1LL << 30), 1LL << 30); @@ -441,19 +448,19 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul32 *)loc |= (S + A - P) & 0x7fff'ffff; break; case R_ARM_THM_MOVW_ABS_NC: - write_thm_mov_imm(loc, (S + A) | T); + write_thm_mov(loc, (S + A) | T); break; case R_ARM_MOVT_PREL: - write_mov_imm(loc, (S + A - P) >> 16); + write_arm_mov(loc, (S + A - P) >> 16); break; case R_ARM_THM_MOVT_PREL: - write_thm_mov_imm(loc, (S + A - P) >> 16); + write_thm_mov(loc, (S + A - P) >> 16); break; case R_ARM_MOVT_ABS: - write_mov_imm(loc, (S + A) >> 16); + write_arm_mov(loc, (S + A) >> 16); break; case R_ARM_THM_MOVT_ABS: - write_thm_mov_imm(loc, (S + A) >> 16); + write_thm_mov(loc, (S + A) >> 16); break; case R_ARM_TLS_GD32: *(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - P; @@ -506,8 +513,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; case R_ARM_TLS_CALL: if (sym.has_tlsdesc(ctx)) { - // BL - *(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2); + *(ul32 *)loc = 0xeb00'0000; // bl 0 + *(ul32 *)loc |= bits(get_tlsdesc_trampoline_addr() - P - 8, 25, 2); } else if (sym.has_gottp(ctx)) { *(ul32 *)loc = 0xe79f'0000; // ldr r0, [pc, r0] } else { @@ -516,8 +523,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; case R_ARM_THM_TLS_CALL: if (sym.has_tlsdesc(ctx)) { - u64 val = align_to(get_tls_trampoline_addr(P + 4), 4); - 
write_thm_b_imm(loc, val); + u64 val = align_to(get_tlsdesc_trampoline_addr() - P - 4, 4); + write_thm_b25(loc, val); *(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX } else if (sym.has_gottp(ctx)) { // Since `ldr r0, [pc, r0]` is not representable in Thumb, @@ -663,7 +670,7 @@ void Thunk::copy_buf(Context &ctx) { // This is a range extension and mode switch thunk. // It has two entry points: +0 for Thumb and +4 for ARM. - static const u8 entry[] = { + constexpr u8 entry[] = { // .thumb 0x78, 0x47, // bx pc # jumps to 1f 0xc0, 0x46, // nop @@ -715,7 +722,7 @@ void create_arm_exidx_section(Context &ctx) { void Arm32ExidxSection::compute_section_size(Context &ctx) { output_section.compute_section_size(ctx); - this->shdr.sh_size = output_section.shdr.sh_size; + this->shdr.sh_size = output_section.shdr.sh_size + 8; // +8 for sentinel } void Arm32ExidxSection::update_shdr(Context &ctx) { @@ -731,10 +738,19 @@ void Arm32ExidxSection::remove_duplicate_entries(Context &ctx) { void Arm32ExidxSection::copy_buf(Context &ctx) { std::vector contents = get_contents(ctx); - assert(this->shdr.sh_size = contents.size()); + assert(this->shdr.sh_size == contents.size()); write_vector(ctx.buf + this->shdr.sh_offset, contents); } +// Returns the end of the text segment +static u64 get_text_end(Context &ctx) { + u64 ret = 0; + for (Chunk *chunk : ctx.chunks) + if (chunk->shdr.sh_flags & SHF_EXECINSTR) + ret = std::max(ret, chunk->shdr.sh_addr + chunk->shdr.sh_size); + return ret; +} + // ARM executables use an .ARM.exidx section to look up an exception // handling record for the current instruction pointer. The table needs // to be sorted by their addresses. @@ -745,11 +761,6 @@ void Arm32ExidxSection::copy_buf(Context &ctx) { // // This function returns contents of .ARM.exidx. 
std::vector Arm32ExidxSection::get_contents(Context &ctx) { - std::vector buf(output_section.shdr.sh_size); - - output_section.shdr.sh_addr = this->shdr.sh_addr; - output_section.write_to(ctx, buf.data(), nullptr); - // .ARM.exidx records consists of a signed 31-bit relative address // and a 32-bit value. The relative address indicates the start // address of a function that the record covers. The value is one of @@ -769,11 +780,19 @@ std::vector Arm32ExidxSection::get_contents(Context &ctx) { ul32 val; }; - if (buf.size() % sizeof(Entry)) - Fatal(ctx) << "invalid .ARM.exidx section size"; - + // We reserve one extra slot for the sentinel + i64 num_entries = output_section.shdr.sh_size / sizeof(Entry) + 1; + std::vector buf(num_entries * sizeof(Entry)); Entry *ent = (Entry *)buf.data(); - i64 num_entries = buf.size() / sizeof(Entry); + + // Write section contents to the buffer + output_section.shdr.sh_addr = this->shdr.sh_addr; + output_section.write_to(ctx, buf.data()); + + // Fill in sentinel fields + u64 sentinel_addr = this->shdr.sh_addr + sizeof(Entry) * (num_entries - 1); + ent[num_entries - 1].addr = get_text_end(ctx) - sentinel_addr; + ent[num_entries - 1].val = CANTUNWIND; // Entry's addresses are relative to themselves. 
In order to sort // records by address, we first translate them so that the addresses @@ -784,7 +803,7 @@ std::vector Arm32ExidxSection::get_contents(Context &ctx) { tbb::parallel_for((i64)0, num_entries, [&](i64 i) { i64 offset = sizeof(Entry) * i; - ent[i].addr = sign_extend(ent[i].addr, 30) + offset; + ent[i].addr = sign_extend(ent[i].addr, 31) + offset; if (is_relative(ent[i].val)) ent[i].val = 0x7fff'ffff & (ent[i].val + offset); }); diff --git a/src/arch-arm64.cc b/src/arch-arm64.cc index 6fc237b88b..d04edb2920 100644 --- a/src/arch-arm64.cc +++ b/src/arch-arm64.cc @@ -17,11 +17,13 @@ // // https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst +#if MOLD_ARM64LE || MOLD_ARM64BE + #include "mold.h" namespace mold { -using E = ARM64; +using E = MOLD_TARGET; static void write_adrp(u8 *buf, u64 val) { *(ul32 *)buf |= (bits(val, 13, 12) << 29) | (bits(val, 32, 14) << 5); @@ -110,13 +112,13 @@ void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, case R_NONE: break; case R_AARCH64_ABS64: - *(ul64 *)loc = val; + *(U64 *)loc = val; break; case R_AARCH64_PREL32: - *(ul32 *)loc = val - this->shdr.sh_addr - offset; + *(U32 *)loc = val - this->shdr.sh_addr - offset; break; case R_AARCH64_PREL64: - *(ul64 *)loc = val - this->shdr.sh_addr - offset; + *(U64 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; @@ -236,7 +238,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { i + 1 < rels.size()) { i64 val = S + A - P - 4; const ElfRel &rel2 = rels[i + 1]; - if (sign_extend(val, 20) == val && + if (is_int(val, 21) && rel2.r_type == R_AARCH64_ADD_ABS_LO12_NC && rel2.r_sym == rel.r_sym && rel2.r_offset == rel.r_offset + 4 && @@ -275,14 +277,14 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { } i64 val = S + A - P; - if (val < -(1 << 27) || (1 << 27) <= val) - val = get_thunk_addr(i) + A - P; + if (!is_int(val, 28)) + val = sym.get_thunk_addr(ctx, P) 
+ A - P; *(ul32 *)loc |= bits(val, 27, 2); break; } case R_AARCH64_PLT32: check(S + A - P, -(1LL << 31), 1LL << 31); - *(ul32 *)loc = S + A - P; + *(U32 *)loc = S + A - P; break; case R_AARCH64_CONDBR19: case R_AARCH64_LD_PREL_LO19: @@ -291,14 +293,14 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; case R_AARCH64_PREL16: check(S + A - P, -(1LL << 15), 1LL << 15); - *(ul16 *)loc = S + A - P; + *(U16 *)loc = S + A - P; break; case R_AARCH64_PREL32: check(S + A - P, -(1LL << 31), 1LL << 32); - *(ul32 *)loc = S + A - P; + *(U32 *)loc = S + A - P; break; case R_AARCH64_PREL64: - *(ul64 *)loc = S + A - P; + *(U64 *)loc = S + A - P; break; case R_AARCH64_LD64_GOT_LO12_NC: *(ul32 *)loc |= bits(G + GOT + A, 11, 3) << 10; @@ -463,14 +465,14 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_AARCH64_ABS64: if (std::optional val = get_tombstone(sym, frag)) - *(ul64 *)loc = *val; + *(U64 *)loc = *val; else - *(ul64 *)loc = S + A; + *(U64 *)loc = S + A; break; case R_AARCH64_ABS32: { i64 val = S + A; check(val, 0, 1LL << 32); - *(ul32 *)loc = val; + *(U32 *)loc = val; break; } default: @@ -619,3 +621,5 @@ void Thunk::copy_buf(Context &ctx) { } } // namespace mold + +#endif diff --git a/src/arch-loongarch.cc b/src/arch-loongarch.cc index dda138e99a..cb8abe5c39 100644 --- a/src/arch-loongarch.cc +++ b/src/arch-loongarch.cc @@ -51,7 +51,7 @@ static u64 hi20(u64 val, u64 pc) { return bits(page(val + 0x800) - page(pc), 31, 12); } -static u64 higher20(u64 val, u64 pc) { +static u64 hi32(u64 val, u64 pc) { // A PC-relative 64-bit address is materialized with the following // instructions for the large code model: // @@ -68,12 +68,15 @@ static u64 higher20(u64 val, u64 pc) { // Compensating all the sign-extensions is a bit complicated. The // psABI gave the following formula. val = val + 0x8000'0000 + ((val & 0x800) ? 
(0x1000 - 0x1'0000'0000) : 0); - return bits(page(val) - page(pc - 8), 51, 32); + return page(val) - page(pc - 8); +} + +static u64 higher20(u64 val, u64 pc) { + return bits(hi32(val, pc), 51, 32); } static u64 highest12(u64 val, u64 pc) { - val = val + 0x8000'0000 + ((val & 0x800) ? (0x1000 - 0x1'0000'0000) : 0); - return bits(page(val) - page(pc - 12), 63, 52); + return bits(hi32(val, pc), 63, 52); } static void write_k12(u8 *loc, u32 val) { @@ -125,11 +128,12 @@ static void set_rj(u8 *loc, u32 rj) { // Returns true if isec's i'th relocation refers to the following // relaxable instructioon pair. // -// pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20 -// ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12 +// pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20, R_LARCH_RELAX +// ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12, R_LARCH_RELAX static bool is_relaxable_got_load(Context &ctx, InputSection &isec, i64 i) { std::span> rels = isec.get_rels(ctx); Symbol &sym = *isec.file.symbols[rels[i].r_sym]; + u8 *buf = (u8 *)isec.contents.data(); if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && @@ -137,8 +141,8 @@ static bool is_relaxable_got_load(Context &ctx, InputSection &isec, i64 i) rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && rels[i + 2].r_offset == rels[i].r_offset + 4 && rels[i + 3].r_type == R_LARCH_RELAX) { - u32 insn1 = *(ul32 *)(isec.contents.data() + rels[i].r_offset); - u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); + u32 insn1 = *(ul32 *)(buf + rels[i].r_offset); + u32 insn2 = *(ul32 *)(buf + rels[i].r_offset + 4); bool is_ld_d = (insn2 & 0xffc0'0000) == 0x28c0'0000; return get_rd(insn1) == get_rd(insn2) && get_rd(insn2) == get_rj(insn2) && is_ld_d; @@ -179,14 +183,14 @@ void write_plt_header(Context &ctx, u8 *buf) { write_k12(buf + 16, gotplt); } -constexpr ul32 plt_entry_64[] = { +static constexpr ul32 plt_entry_64[] = { 0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt) 0x28c0'01ef, // ld.d $t3, $t3, %lo12(func@.got.plt) 0x4c00'01ed, // jirl $t1, $t3, 0 
0x002a'0000, // break }; -constexpr ul32 plt_entry_32[] = { +static constexpr ul32 plt_entry_32[] = { 0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt) 0x2880'01ef, // ld.w $t3, $t3, %lo12(func@.got.plt) 0x4c00'01ed, // jirl $t1, $t3, 0 @@ -265,10 +269,9 @@ void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - - auto get_r_delta = [&](i64 idx) { - return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx]; - }; + std::span deltas = extra.r_deltas; + i64 k = 0; + u8 *buf = (u8 *)contents.data(); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; @@ -278,9 +281,20 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { rel.r_type == R_LARCH_ALIGN) continue; + i64 removed_bytes = 0; + i64 r_delta = 0; + + if (!deltas.empty()) { + while (k < deltas.size() && deltas[k].offset < rel.r_offset) + k++; + if (k < deltas.size() && deltas[k].offset == rel.r_offset) + removed_bytes = get_removed_bytes(deltas, k); + if (k > 0) + r_delta = deltas[k - 1].delta; + } + Symbol &sym = *file.symbols[rel.r_sym]; - i64 r_offset = rel.r_offset - get_r_delta(i); - i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i); + i64 r_offset = rel.r_offset - r_delta; u8 *loc = base + r_offset; auto check = [&](i64 val, i64 lo, i64 hi) { @@ -355,7 +369,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // It is contrary to the psABI document, but GNU ld has special // code to handle it, so we accept it too. 
if ((*(ul32 *)loc & 0xfc00'0000) == 0x4c00'0000) - write_k16(loc, sign_extend(S + A, 11) >> 2); + write_k16(loc, sign_extend(S + A, 12) >> 2); else write_k12(loc, S + A); break; @@ -393,7 +407,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // addi.d $t0, $t0, 0 if (is_relaxable_got_load(ctx, *this, i)) { i64 dist = compute_distance(ctx, sym, *this, rel); - if (-(1LL << 31) <= dist && dist < (1LL << 31)) { + if ((i32)dist == dist) { u32 rd = get_rd(*(ul32 *)loc); *(ul32 *)(loc + 4) = 0x02c0'0000 | (rd << 5) | rd; // addi.d @@ -513,12 +527,14 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; case R_LARCH_CALL36: if (removed_bytes == 0) { - write_j20(loc, (S + A - P + 0x20000) >> 18); - write_k16(loc + 4, (S + A - P) >> 2); + i64 val = S + A - P; + check_branch(val, -(1LL << 37) - 0x20000, (1LL << 37) - 0x20000); + write_j20(loc, (val + 0x20000) >> 18); + write_k16(loc + 4, val >> 2); } else { // Rewrite PCADDU18I + JIRL to B or BL assert(removed_bytes == 4); - if (get_rd(*(ul32 *)(contents.data() + rel.r_offset + 4)) == 0) + if (get_rd(*(ul32 *)(buf + rel.r_offset + 4)) == 0) *(ul32 *)loc = 0x5000'0000; // B else *(ul32 *)loc = 0x5400'0000; // BL @@ -547,63 +563,50 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // We may relax the instructions to the following if its TP-relative // address is known at link-time // - // - // + // + // // lu12i.w $a0, foo@TPOFF // addi.w $a0, $a0, foo@TPOFF // // or to the following if the TP offset is small enough. // - // - // - // + // + // + // // ori $a0, $zero, foo@TPOFF // // If the TP-relative address is known at process startup time, we // may relax the instructions to the following. // - // - // + // + // // pcalau12i $a0, foo@GOTTP // ld.[dw] $a0, $a0, foo@GOTTP // // If we don't know anything about the symbol, we can still relax // the first two instructions to a single pcaddi as shown below. 
// - // + // // pcaddi $a0, foo@GOTDESC // ld.d $ra, $a0, 0 // jirl $ra, $ra, 0 // - // Note that if section-shrinking relaxation is enabled, nop may be - // completely deleted. - if (removed_bytes == 0) { - if (sym.has_tlsdesc(ctx)) { - i64 dist = sym.get_tlsdesc_addr(ctx) + A - P; - if (ctx.arg.relax && -(1 << 21) <= dist && dist < (1 << 21)) { - *(ul32 *)loc = 0x0340'0000; // nop - } else { - write_j20(loc, hi20(sym.get_tlsdesc_addr(ctx) + A, P)); - } - } else { - *(ul32 *)loc = 0x0340'0000; // nop - } - } + // If the code-shrinking relaxation is disabled, we may leave + // original useless instructions instead of deleting them, but we + // accept that because relaxations are enabled by default. + if (sym.has_tlsdesc(ctx) && removed_bytes == 0) + write_j20(loc, hi20(sym.get_tlsdesc_addr(ctx) + A, P)); break; case R_LARCH_TLS_DESC_PC_LO12: - if (removed_bytes == 0) { - if (sym.has_tlsdesc(ctx)) { - i64 dist = sym.get_tlsdesc_addr(ctx) + A - P; - if (ctx.arg.relax && -(1 << 21) <= dist && dist < (1 << 21)) { - // If we can directly materialize the PC-relative address - // with pcaddi, do that. - *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi - write_j20(loc, dist >> 2); - } else { - write_k12(loc, sym.get_tlsdesc_addr(ctx) + A); - } + if (sym.has_tlsdesc(ctx) && removed_bytes == 0) { + i64 dist = sym.get_tlsdesc_addr(ctx) + A - P; + if (is_int(dist, 22)) { + // If we can directly materialize the PC-relative address + // with pcaddi, do that. 
+ *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi + write_j20(loc, dist >> 2); } else { - *(ul32 *)loc = 0x0340'0000; // nop + write_k12(loc, sym.get_tlsdesc_addr(ctx) + A); } } break; @@ -629,7 +632,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_k12(loc, sym.get_gottp_addr(ctx) + A); } else { i64 val = S + A - ctx.tp_addr; - if (val < 0x1000) + if (0 <= val && val < 0x1000) *(ul32 *)loc = 0x0380'0004; // ori $a0, $zero, 0 else *(ul32 *)loc = 0x0280'0084; // addi.w $a0, $a0, 0 @@ -646,7 +649,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // Rewrite `addi.d $t0, $t0, ` with `addi.d $t0, $tp, ` // if the offset is directly accessible using tp. tp is r2. - if (sign_extend(val, 11) == val) + if (is_int(val, 12)) set_rj(loc, 2); break; } @@ -857,15 +860,20 @@ void InputSection::scan_relocations(Context &ctx) { } template <> -void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { +void shrink_section(Context &ctx, InputSection &isec) { std::span> rels = isec.get_rels(ctx); - isec.extra.r_deltas.resize(rels.size() + 1); - i64 delta = 0; + std::vector &deltas = isec.extra.r_deltas; + i64 r_delta = 0; + u8 *buf = (u8 *)isec.contents.data(); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &r = rels[i]; Symbol &sym = *isec.file.symbols[r.r_sym]; - isec.extra.r_deltas[i] = delta; + + auto remove = [&](i64 d) { + r_delta += d; + deltas.push_back(RelocDelta{r.r_offset, r_delta}); + }; // A R_LARCH_ALIGN relocation refers to the beginning of a nop // sequence. 
We need to remove some or all of them so that the @@ -890,9 +898,11 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { alignment = r.r_addend + 4; } - u64 loc = isec.get_addr() + r.r_offset - delta; - u64 next_loc = loc + alignment - 4; - delta += next_loc - align_to(loc, alignment); + u64 P = isec.get_addr() + r.r_offset - r_delta; + u64 desired = align_to(P, alignment); + u64 actual = P + alignment - 4; + if (desired != actual) + remove(actual - desired); continue; } @@ -921,8 +931,8 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // // addi.d $t0, $tp, if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; - sign_extend(val, 11) == val) - delta += 4; + is_int(val, 12)) + remove(4); break; case R_LARCH_PCALA_HI20: // The following two instructions are used to materialize a @@ -940,14 +950,14 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { rels[i + 2].r_offset == rels[i].r_offset + 4 && rels[i + 3].r_type == R_LARCH_RELAX) { i64 dist = compute_distance(ctx, sym, isec, r); - u32 insn1 = *(ul32 *)(isec.contents.data() + rels[i].r_offset); - u32 insn2 = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); + u32 insn1 = *(ul32 *)(buf + rels[i].r_offset); + u32 insn2 = *(ul32 *)(buf + rels[i].r_offset + 4); bool is_addi_d = (insn2 & 0xffc0'0000) == 0x02c0'0000; - if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21) && + if ((dist & 0b11) == 0 && is_int(dist, 22) && is_addi_d && get_rd(insn1) == get_rd(insn2) && get_rd(insn2) == get_rj(insn2)) - delta += 4; + remove(4); } break; case R_LARCH_CALL36: @@ -960,10 +970,10 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // If the displacement is PC ± 128 MiB, we can use B or BL instead. // Note that $zero is $r0 and $ra is $r1. 
if (i64 dist = compute_distance(ctx, sym, isec, r); - -(1 << 27) <= dist && dist < (1 << 27)) - if (u32 jirl = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); + is_int(dist, 28)) + if (u32 jirl = *(ul32 *)(buf + rels[i].r_offset + 4); get_rd(jirl) == 0 || get_rd(jirl) == 1) - delta += 4; + remove(4); break; case R_LARCH_GOT_PC_HI20: // The following two instructions are used to load a symbol address @@ -976,36 +986,35 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // relax them to the following instruction. // // pcaddi $t0, - if (is_relaxable_got_load(ctx, isec, i)) { - i64 dist = compute_distance(ctx, sym, isec, r); - if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21)) - delta += 4; - } + if (is_relaxable_got_load(ctx, isec, i)) + if (i64 dist = compute_distance(ctx, sym, isec, r); + is_int(dist, 22)) + remove(4); break; case R_LARCH_TLS_DESC_PC_HI20: if (sym.has_tlsdesc(ctx)) { u64 P = isec.get_addr() + r.r_offset; i64 dist = sym.get_tlsdesc_addr(ctx) + r.r_addend - P; - if (-(1 << 21) <= dist && dist < (1 << 21)) - delta += 4; + if (is_int(dist, 22)) + remove(4); } else { - delta += 4; + remove(4); } break; case R_LARCH_TLS_DESC_PC_LO12: if (!sym.has_tlsdesc(ctx)) - delta += 4; + remove(4); break; case R_LARCH_TLS_DESC_LD: - if (!sym.has_tlsdesc(ctx) && !sym.has_gottp(ctx) && - sym.get_addr(ctx) + r.r_addend - ctx.tp_addr < 0x1000) - delta += 4; + if (!sym.has_tlsdesc(ctx) && !sym.has_gottp(ctx)) + if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; + 0 <= val && val < 0x1000) + remove(4); break; } } - isec.extra.r_deltas[rels.size()] = delta; - isec.sh_size -= delta; + isec.sh_size -= r_delta; } } // namespace mold diff --git a/src/arch-m68k.cc b/src/arch-m68k.cc index edffe04801..2cce8ce9eb 100644 --- a/src/arch-m68k.cc +++ b/src/arch-m68k.cc @@ -1,10 +1,10 @@ // This file contains code for the Motorola 68000 series microprocessors, // which is often abbreviated as m68k. 
Running a Unix-like system on a -// m68k-based machine today is probably a retro-computing hobby activity, -// but the processor was a popular choice to build Unix computers during -// '80s. Early Sun workstations for example used m68k. Macintosh until -// 1994 were based on m68k as well until they switched to PowerPC (and -// then to x86 and to ARM.) +// m68k-based machine today is a retro-computing hobby activity, but the +// processor was a popular choice to build Unix computers during '80s. +// Early Sun workstations for example used m68k. Macintosh until 1994 were +// based on m68k as well until they switched to PowerPC (and then to x86 +// and to ARM.) // // From the linker's point of view, it is not hard to support m68k. It's // just a 32-bit big-endian CISC ISA. Compared to comtemporary i386, diff --git a/src/arch-ppc32.cc b/src/arch-ppc32.cc index 4525e73d48..9fccf8fd04 100644 --- a/src/arch-ppc32.cc +++ b/src/arch-ppc32.cc @@ -88,7 +88,7 @@ void write_plt_header(Context &ctx, u8 *buf) { loc[5] |= lo(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4); } -constexpr ub32 plt_entry[] = { +static constexpr ub32 plt_entry[] = { // Get the address of this PLT entry 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 @@ -213,15 +213,15 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_PPC_REL24: case R_PPC_LOCAL24PC: { i64 val = S + A - P; - if (sign_extend(val, 25) != val) - val = get_thunk_addr(i) - P; + if (!is_int(val, 26)) + val = sym.get_thunk_addr(ctx, P) - P; *(ub32 *)loc |= bits(val, 25, 2) << 2; break; } case R_PPC_PLTREL24: { i64 val = S - P; - if (sym.has_plt(ctx) || sign_extend(val, 25) != val) - val = get_thunk_addr(i) - P; + if (sym.has_plt(ctx) || !is_int(val, 26)) + val = sym.get_thunk_addr(ctx, P) - P; *(ub32 *)loc |= bits(val, 25, 2) << 2; break; } diff --git a/src/arch-ppc64v1.cc b/src/arch-ppc64v1.cc index e3ec1c5557..20b395f3cf 100644 --- a/src/arch-ppc64v1.cc +++ b/src/arch-ppc64v1.cc @@ -194,8 +194,8 @@ void 
InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; case R_PPC64_REL24: { i64 val = sym.get_addr(ctx, NO_OPD) + A - P; - if (sym.has_plt(ctx) || sign_extend(val, 25) != val) - val = get_thunk_addr(i) + A - P; + if (sym.has_plt(ctx) || !is_int(val, 26)) + val = sym.get_thunk_addr(ctx, P) + A - P; check(val, -(1 << 25), 1 << 25); *(ub32 *)loc |= bits(val, 25, 2) << 2; @@ -507,12 +507,13 @@ get_relocation_at(Context &ctx, InputSection &isec, i64 offset) { return &*it; } +namespace { struct OpdSymbol { bool operator<(const OpdSymbol &x) const { return r_offset < x.r_offset; } - u64 r_offset = 0; Symbol *sym = nullptr; }; +} static Symbol * get_opd_sym_at(std::span syms, u64 offset) { @@ -589,7 +590,7 @@ void ppc64v1_rewrite_opd(Context &ctx) { Symbol *sym2 = file->symbols[rel->r_sym]; if (sym2->get_type() != STT_SECTION) - Fatal(ctx) << *file << ": bad relocation in .opd referring " << *sym2; + Fatal(ctx) << *file << ": bad relocation in .opd referring to " << *sym2; opd_syms.push_back({sym->value, sym}); diff --git a/src/arch-ppc64v2.cc b/src/arch-ppc64v2.cc index fdb5d568c1..767d45f5f2 100644 --- a/src/arch-ppc64v2.cc +++ b/src/arch-ppc64v2.cc @@ -201,8 +201,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 GOT = ctx.got->shdr.sh_addr; u64 TOC = ctx.extra.TOC->value; - auto r2save_thunk_addr = [&] { return get_thunk_addr(i); }; - auto no_r2save_thunk_addr = [&] { return get_thunk_addr(i) + 8; }; + auto r2save_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); }; + auto no_r2save_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 8; }; switch (rel.r_type) { case R_PPC64_TOC16_HA: @@ -227,7 +227,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul32 *)(loc + 4) = 0xe841'0018; // ld r2, 24(r1) } else { i64 val = S + get_local_entry_offset(ctx, sym) + A - P; - if (sign_extend(val, 25) != val) + if (!is_int(val, 26)) val = no_r2save_thunk_addr() + A - P; *(ul32 *)loc |= bits(val, 25, 2) << 2; } @@ -238,7 +238,7 
@@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul32 *)loc |= bits(val, 25, 2) << 2; } else { i64 val = S + A - P; - if (sign_extend(val, 25) != val) + if (!is_int(val, 26)) val = no_r2save_thunk_addr() + A - P; *(ul32 *)loc |= bits(val, 25, 2) << 2; } @@ -334,6 +334,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_PPC64_TLS: case R_PPC64_TLSGD: case R_PPC64_TLSLD: + case R_PPC64_ENTRY: break; default: unreachable(); @@ -467,6 +468,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_PPC64_DTPREL16_LO: case R_PPC64_DTPREL16_LO_DS: case R_PPC64_DTPREL34: + case R_PPC64_ENTRY: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; diff --git a/src/arch-riscv.cc b/src/arch-riscv.cc index fd600b613c..be82d7a1d8 100644 --- a/src/arch-riscv.cc +++ b/src/arch-riscv.cc @@ -22,8 +22,6 @@ #include "mold.h" #include -#include -#include namespace mold { @@ -88,6 +86,10 @@ static void set_rs1(u8 *loc, u32 rs1) { *(ul32 *)loc |= rs1 << 15; } +static u32 get_rd(u8 *loc) { + return bits(*(u32 *)loc, 11, 7); +}; + template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ul32 insn_64[] = { @@ -121,14 +123,14 @@ void write_plt_header(Context &ctx, u8 *buf) { write_itype(buf + 16, gotplt - plt); } -constexpr ul32 plt_entry_64[] = { +static constexpr ul32 plt_entry_64[] = { 0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt) 0x000e'3e03, // ld t3, %pcrel_lo(1b)(t3) 0x000e'0367, // jalr t1, t3 0x0010'0073, // ebreak }; -constexpr ul32 plt_entry_32[] = { +static constexpr ul32 plt_entry_32[] = { 0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt) 0x000e'2e03, // lw t3, %pcrel_lo(1b)(t3) 0x000e'0367, // jalr t1, t3 @@ -198,30 +200,75 @@ void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, } } -static inline bool is_hi20(const ElfRel &rel) { - u32 ty = rel.r_type; - return ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 || - ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20 
|| - ty == R_RISCV_TLSDESC_HI20; +// RISC-V generally uses the AUIPC + ADDI/LW/SW/etc instruction pair +// to access the AUIPC's address ± 2 GiB. AUIPC materializes the most +// significant 52 bits in a PC-relative manner, and the following +// instruction specifies the remaining least significant 12 bits. +// There are several HI20 and LO12 relocation types for them. +// +// LO12 relocations need to materialize an address relative to AUIPC's +// address, not relative to the instruction that the relocation +// directly refers to. +// +// The problem here is that the instruction pair may not always be +// adjacent. We need a mechanism to find a paired AUIPC for a given +// LO12 relocation. For this purpose, the compiler creates a local +// symbol for each location to which HI20 refers, and the LO12 +// relocation refers to that symbol. +// +// This function returns a paired HI20 relocation for a given LO12. +// Since the instructions are typically adjacent, we do a linear +// search. +static const ElfRel & +find_paired_reloc(Context &ctx, InputSection &isec, + std::span> rels, + Symbol &sym, i64 i) { + auto is_hi20 = [](u32 ty) { + return ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 || + ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20 || + ty == R_RISCV_TLSDESC_HI20; + }; + + u64 value = sym.esym().st_value; + + if (value <= rels[i].r_offset) { + for (i64 j = i - 1; j >= 0; j--) + if (is_hi20(rels[j].r_type) && value == rels[j].r_offset) + return rels[j]; + } else { + for (i64 j = i + 1; j < rels.size(); j++) + if (is_hi20(rels[j].r_type) && value == rels[j].r_offset) + return rels[j]; + } + Fatal(ctx) << isec << ": paired relocation is missing: " << i; } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - u64 GP = ctx.__global_pointer ? ctx.__global_pointer->get_addr(ctx) : 0; - - auto get_r_delta = [&](i64 idx) { - return extra.r_deltas.empty() ? 
0 : extra.r_deltas[idx]; - }; + std::span deltas = extra.r_deltas; + i64 k = 0; + u8 *buf = (u8 *)contents.data(); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX) continue; + i64 removed_bytes = 0; + i64 r_delta = 0; + + if (!deltas.empty()) { + while (k < deltas.size() && deltas[k].offset < rel.r_offset) + k++; + if (k < deltas.size() && deltas[k].offset == rel.r_offset) + removed_bytes = get_removed_bytes(deltas, k); + if (k > 0) + r_delta = deltas[k - 1].delta; + } + Symbol &sym = *file.symbols[rel.r_sym]; - i64 r_offset = rel.r_offset - get_r_delta(i); - i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i); + i64 r_offset = rel.r_offset - r_delta; u8 *loc = base + r_offset; auto check = [&](i64 val, i64 lo, i64 hi) { @@ -231,25 +278,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { << lo << ", " << hi << ")"; }; - auto find_paired_reloc = [&] { - if (sym.value <= rels[i].r_offset - get_r_delta(i)) { - for (i64 j = i - 1; j >= 0; j--) - if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j)) - return j; - } else { - for (i64 j = i + 1; j < rels.size(); j++) - if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j)) - return j; - } - - Fatal(ctx) << *this << ": paired relocation is missing: " << i; - }; - - auto get_rd = [&](i64 offset) { - // Returns the rd register of an R/I/U/J-type instruction. - return bits(*(ul32 *)(contents.data() + offset), 11, 7); - }; - u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + r_offset; @@ -274,7 +302,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_RISCV_CALL: case R_RISCV_CALL_PLT: { i64 val = S + A - P; - i64 rd = get_rd(rel.r_offset + 4); + i64 rd = get_rd(buf + rel.r_offset + 4); // Calling an undefined weak symbol does not make sense. // We make such call into an infinite loop. 
This should @@ -308,22 +336,20 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // pair to load a symbol value from the GOT. If the symbol value // is actually a link-time constant, we can materialize the value // directly into a register to eliminate a memory load. - i64 rd = get_rd(rel.r_offset); + i64 rd = get_rd(buf + rel.r_offset); - switch (removed_bytes) { - case 6: + if (removed_bytes == 6) { // c.li , val *(ul16 *)loc = 0b010'0'00000'00000'01 | (rd << 7); write_citype(loc, sym.get_addr(ctx)); i += 3; - break; - case 4: + } else if (removed_bytes == 4) { // addi , zero, val *(ul32 *)loc = 0b0010011 | (rd << 7); write_itype(loc, sym.get_addr(ctx)); i += 3; - break; - case 0: + } else { + assert(removed_bytes == 0); if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && i + 3 < rels.size() && @@ -333,7 +359,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { file.symbols[rels[i + 2].r_sym]->value == r_offset && rels[i + 3].r_type == R_RISCV_RELAX) { i64 val = S + A - P; - if (rd == get_rd(rel.r_offset + 4) && (i32)val == val) { + if (rd == get_rd(buf + rel.r_offset + 4) && (i32)val == val) { // auipc , %hi20(val) write_utype(loc, val); @@ -344,11 +370,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; } } - write_utype(loc, G + GOT + A - P); - break; - default: - unreachable(); } break; } @@ -362,50 +384,38 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_utype(loc, S + A - P); break; case R_RISCV_PCREL_LO12_I: - case R_RISCV_PCREL_LO12_S: - case R_RISCV_GPREL_LO12_I: - case R_RISCV_GPREL_LO12_S: { - i64 idx2 = find_paired_reloc(); - const ElfRel &rel2 = rels[idx2]; + case R_RISCV_PCREL_LO12_S: { + const ElfRel &rel2 = find_paired_reloc(ctx, *this, rels, sym, i); Symbol &sym2 = *file.symbols[rel2.r_sym]; + auto write = + (rel.r_type == R_RISCV_PCREL_LO12_I) ? 
write_itype : write_stype; + u64 S = sym2.get_addr(ctx); u64 A = rel2.r_addend; - u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2); + u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset); u64 G = sym2.get_got_idx(ctx) * sizeof(Word); - u64 val; switch (rel2.r_type) { case R_RISCV_GOT_HI20: - val = G + GOT + A - P; + write(loc, G + GOT + A - P); break; case R_RISCV_TLS_GOT_HI20: - val = sym2.get_gottp_addr(ctx) + A - P; + write(loc, sym2.get_gottp_addr(ctx) + A - P); break; case R_RISCV_TLS_GD_HI20: - val = sym2.get_tlsgd_addr(ctx) + A - P; + write(loc, sym2.get_tlsgd_addr(ctx) + A - P); break; case R_RISCV_PCREL_HI20: - val = S + A - P; - break; - case R_RISCV_GPREL_HI20: - val = S + A - GP; + write(loc, S + A - P); break; - default: - unreachable(); } - - if (rel.r_type == R_RISCV_PCREL_LO12_I || - rel.r_type == R_RISCV_GPREL_LO12_I) - write_itype(loc, val); - else - write_stype(loc, val); break; } case R_RISCV_HI20: if (removed_bytes == 2) { // Rewrite LUI with C.LUI - i64 rd = get_rd(rel.r_offset); + i64 rd = get_rd(buf + rel.r_offset); *(ul16 *)loc = 0b011'0'00000'00000'01 | (rd << 7); write_citype(loc, (S + A + 0x800) >> 12); } else if (removed_bytes == 0) { @@ -423,7 +433,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // Rewrite `lw t1, 0(t0)` with `lw t1, 0(x0)` if the address is // accessible relative to the zero register because if that's the // case, corresponding LUI might have been removed by relaxation. - if (sign_extend(S + A, 11) == S + A) + if (is_int(S + A, 12)) set_rs1(loc, 0); break; case R_RISCV_TPREL_HI20: @@ -447,7 +457,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // Rewrite `lw t1, 0(t0)` with `lw t1, 0(tp)` if the address is // directly accessible using tp. tp is x4. 
- if (sign_extend(val, 11) == val) + if (is_int(val, 12)) set_rs1(loc, 4); break; } @@ -486,7 +496,11 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // // lui a0, %tpoff_hi(a0) // addi a0, a0, %tpoff_lo(a0) - if (removed_bytes == 0) + // + // If the code-shrinking relaxation is disabled, we may leave + // original useless instructions instead of deleting them, but we + // accept that because relaxations are enabled by default. + if (sym.has_tlsdesc(ctx) && removed_bytes == 0) write_utype(loc, sym.get_tlsdesc_addr(ctx) + A - P); break; case R_RISCV_TLSDESC_LOAD_LO12: @@ -495,17 +509,19 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { if (removed_bytes == 4) break; - i64 idx2 = find_paired_reloc(); - const ElfRel &rel2 = rels[idx2]; + const ElfRel &rel2 = find_paired_reloc(ctx, *this, rels, sym, i); Symbol &sym2 = *file.symbols[rel2.r_sym]; u64 S = sym2.get_addr(ctx); u64 A = rel2.r_addend; - u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2); + u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset); switch (rel.r_type) { case R_RISCV_TLSDESC_LOAD_LO12: - write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); + if (sym2.has_tlsdesc(ctx)) + write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); + else + *(ul32 *)loc = 0x13; // nop break; case R_RISCV_TLSDESC_ADD_LO12: if (sym2.has_tlsdesc(ctx)) { @@ -527,7 +543,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_itype(loc, sym2.get_gottp_addr(ctx) + A - P); } else { i64 val = S + A - ctx.tp_addr; - if (sign_extend(val, 11) == val) + if (is_int(val, 12)) *(ul32 *)loc = 0x513; // addi a0,zero, else *(ul32 *)loc = 0x50513; // addi a0,a0, @@ -587,10 +603,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { check(S + A - P, -(1 << 11), 1 << 11); write_cjtype(loc, S + A - P); break; - case R_RISCV_GPREL_HI20: - check(S + A - GP, -(1LL << 31), 1LL << 31); - write_utype(loc, S + A - GP); - break; case R_RISCV_SUB6: *loc = (*loc & 
0b1100'0000) | ((*loc - S - A) & 0b0011'1111); break; @@ -756,10 +768,6 @@ void InputSection::scan_relocations(Context &ctx) { case R_RISCV_TPREL_ADD: check_tlsle(ctx, sym, rel); break; - case R_RISCV_GPREL_HI20: - if (ctx.arg.shared) - Error(ctx) << *this << ": R_RISCV_GPREL_HI20 may not be used with -shared"; - break; case R_RISCV_64: case R_RISCV_BRANCH: case R_RISCV_JAL: @@ -781,8 +789,6 @@ void InputSection::scan_relocations(Context &ctx) { case R_RISCV_ALIGN: case R_RISCV_RVC_BRANCH: case R_RISCV_RVC_JUMP: - case R_RISCV_GPREL_LO12_I: - case R_RISCV_GPREL_LO12_S: case R_RISCV_RELAX: case R_RISCV_SUB6: case R_RISCV_SET6: @@ -806,9 +812,9 @@ u64 get_eflags(Context &ctx) { if (objs.empty()) return 0; - u32 ret = objs[0]->get_ehdr().e_flags; + u32 ret = objs[0]->get_eflags(); for (i64 i = 1; i < objs.size(); i++) { - u32 flags = objs[i]->get_ehdr().e_flags; + u32 flags = objs[i]->get_eflags(); if (flags & EF_RISCV_RVC) ret |= EF_RISCV_RVC; @@ -823,22 +829,26 @@ u64 get_eflags(Context &ctx) { return ret; } -// Scan relocations to a given shrink section. +// Scan relocations to shrink a given section. template <> -void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { +void shrink_section(Context &ctx, InputSection &isec) { std::span> rels = isec.get_rels(ctx); - isec.extra.r_deltas.resize(rels.size() + 1); + std::vector &deltas = isec.extra.r_deltas; + i64 r_delta = 0; + u8 *buf = (u8 *)isec.contents.data(); - auto get_rd = [&](i64 offset) { - return bits(*(ul32 *)(isec.contents.data() + offset), 11, 7); - }; - - i64 delta = 0; + // True if we can use 2-byte instructions. This is usually true on + // Unix because RV64GC is generally considered the baseline hardware. 
+ bool use_rvc = isec.file.get_eflags() & EF_RISCV_RVC; for (i64 i = 0; i < rels.size(); i++) { const ElfRel &r = rels[i]; Symbol &sym = *isec.file.symbols[r.r_sym]; - isec.extra.r_deltas[i] = delta; + + auto remove = [&](i64 d) { + r_delta += d; + deltas.push_back(RelocDelta{r.r_offset, r_delta}); + }; // Handling R_RISCV_ALIGN is mandatory. // @@ -848,11 +858,11 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (r.r_type == R_RISCV_ALIGN) { // The total bytes of NOPs is stored to r_addend, so the next // instruction is r_addend away. - u64 loc = isec.get_addr() + r.r_offset - delta; - u64 next_loc = loc + r.r_addend; - u64 alignment = bit_ceil(r.r_addend + 1); - assert(alignment <= (1 << isec.p2align)); - delta += next_loc - align_to(loc, alignment); + u64 P = isec.get_addr() + r.r_offset - r_delta; + u64 desired = align_to(P, bit_ceil(r.r_addend)); + u64 actual = P + r.r_addend; + if (desired != actual) + remove(actual - desired); continue; } @@ -869,20 +879,6 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (sym.file == ctx.internal_obj) continue; - auto find_paired_reloc = [&] { - if (sym.value <= rels[i].r_offset) { - for (i64 j = i - 1; j >= 0; j--) - if (is_hi20(rels[j]) && sym.value == rels[j].r_offset) - return j; - } else { - for (i64 j = i + 1; j < rels.size(); j++) - if (is_hi20(rels[j]) && sym.value == rels[j].r_offset) - return j; - } - - Fatal(ctx) << isec << ": paired relocation is missing: " << i; - }; - switch (r.r_type) { case R_RISCV_CALL: case R_RISCV_CALL_PLT: { @@ -893,19 +889,19 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (dist & 1) break; - i64 rd = get_rd(r.r_offset + 4); + i64 rd = get_rd(buf + r.r_offset + 4); - if (use_rvc && rd == 0 && sign_extend(dist, 11) == dist) { + if (use_rvc && rd == 0 && is_int(dist, 12)) { // If rd is x0 and the jump target is within ±2 KiB, we can use // C.J, saving 6 bytes. 
- delta += 6; - } else if (use_rvc && !E::is_64 && rd == 1 && sign_extend(dist, 11) == dist) { + remove(6); + } else if (use_rvc && !E::is_64 && rd == 1 && is_int(dist, 12)) { // If rd is x1 and the jump target is within ±2 KiB, we can use // C.JAL. This is RV32 only because C.JAL is RV32-only instruction. - delta += 6; - } else if (sign_extend(dist, 20) == dist) { + remove(6); + } else if (is_int(dist, 21)) { // If the jump target is within ±1 MiB, we can use JAL. - delta += 4; + remove(4); } break; } @@ -921,17 +917,17 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { rels[i + 2].r_offset == rels[i].r_offset + 4 && isec.file.symbols[rels[i + 2].r_sym]->value == rels[i].r_offset && rels[i + 3].r_type == R_RISCV_RELAX) { - i64 rd = get_rd(r.r_offset); + i64 rd = get_rd(buf + r.r_offset); - if (rd == get_rd(r.r_offset + 4)) { + if (rd == get_rd(buf + r.r_offset + 4)) { u64 val = sym.get_addr(ctx) + r.r_addend; - if (use_rvc && rd != 0 && sign_extend(val, 5) == val) { + if (use_rvc && rd != 0 && is_int(val, 6)) { // Replace AUIPC + LD with C.LI. - delta += 6; - } else if (sign_extend(val, 11) == val) { + remove(6); + } else if (is_int(val, 12)) { // Replace AUIPC + LD with ADDI. - delta += 4; + remove(4); } } } @@ -939,17 +935,17 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { } case R_RISCV_HI20: { u64 val = sym.get_addr(ctx) + r.r_addend; - i64 rd = get_rd(r.r_offset); + i64 rd = get_rd(buf + r.r_offset); - if (sign_extend(val, 11) == val) { + if (is_int(val, 12)) { // We can replace `lui t0, %hi(foo)` and `add t0, t0, %lo(foo)` // instruction pair with `add t0, x0, %lo(foo)` if foo's bits // [32:11] are all one or all zero. - delta += 4; - } else if (use_rvc && rd != 0 && rd != 2 && sign_extend(val, 17) == val) { + remove(4); + } else if (use_rvc && rd != 0 && rd != 2 && is_int(val + 0x800, 18)) { // If the upper 20 bits can actually be represented in 6 bits, // we can use C.LUI instead of LUI. 
- delta += 2; + remove(2); } break; } @@ -975,35 +971,34 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // // Here, we remove `lui` and `add` if the offset is within ±2 KiB. if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; - sign_extend(val, 11) == val) - delta += 4; + is_int(val, 12)) + remove(4); break; case R_RISCV_TLSDESC_HI20: if (!sym.has_tlsdesc(ctx)) - delta += 4; + remove(4); break; case R_RISCV_TLSDESC_LOAD_LO12: case R_RISCV_TLSDESC_ADD_LO12: { - const ElfRel &rel2 = rels[find_paired_reloc()]; + const ElfRel &rel2 = find_paired_reloc(ctx, isec, rels, sym, i); Symbol &sym2 = *isec.file.symbols[rel2.r_sym]; if (r.r_type == R_RISCV_TLSDESC_LOAD_LO12) { if (!sym2.has_tlsdesc(ctx)) - delta += 4; + remove(4); } else { assert(r.r_type == R_RISCV_TLSDESC_ADD_LO12); if (!sym2.has_tlsdesc(ctx) && !sym2.has_gottp(ctx)) if (i64 val = sym2.get_addr(ctx) + rel2.r_addend - ctx.tp_addr; - sign_extend(val, 11) == val) - delta += 4; + is_int(val, 12)) + remove(4); } break; } } } - isec.extra.r_deltas[rels.size()] = delta; - isec.sh_size -= delta; + isec.sh_size -= r_delta; } // ISA name handlers diff --git a/src/arch-s390x.cc b/src/arch-s390x.cc index dedc607c76..ad70446815 100644 --- a/src/arch-s390x.cc +++ b/src/arch-s390x.cc @@ -33,7 +33,7 @@ // we need to add TP to a return value before use. I don't know why it is // different, but that is the way it is. 
// -// https://github.com/rui314/psabi/blob/main/s390x.pdf +// https://github.com/IBM/s390x-abi/releases/download/v1.6.1/lzsabi_s390x.pdf #include "mold.h" @@ -170,9 +170,6 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { check(S + A, 0, 1LL << 32); *(ub32 *)loc = S + A; break; - case R_390_PLT64: - *(ub64 *)loc = S + A; - break; case R_390_PC12DBL: case R_390_PLT12DBL: check_dbl(S + A - P, -(1 << 12), 1 << 12); @@ -187,6 +184,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ub32 *)loc = S + A - P; break; case R_390_PC64: + case R_390_PLT64: *(ub64 *)loc = S + A - P; break; case R_390_PC16DBL: @@ -250,6 +248,19 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ub32 *)loc = (GOT + A - P) >> 1; break; case R_390_GOTENT: + // If we can relax a GOT-loading LGRL to an address-materializing + // LARL, do that. The format of LGRL is 0xc 0x4 0x8 followed + // by a 32-bit offset. LARL is 0xc 0x0 0x0. + if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx)) { + u64 op = *(ub16 *)(loc - 2); + u64 val = S + A - P; + if ((op & 0xff0f) == 0xc408 && A == 2 && (val & 1) == 0 && + is_int(val, 33)) { + *(ub16 *)(loc - 2) = 0xc000 | (op & 0x00f0); + *(ub32 *)loc = val >> 1; + break; + } + } check_dbl(GOT + G + A - P, -(1LL << 32), 1LL << 32); *(ub32 *)loc = (GOT + G + A - P) >> 1; break; @@ -398,8 +409,10 @@ void InputSection::scan_relocations(Context &ctx) { case R_390_32: scan_absrel(ctx, sym, rel); break; + case R_390_PC12DBL: case R_390_PC16: case R_390_PC16DBL: + case R_390_PC24DBL: case R_390_PC32: case R_390_PC32DBL: case R_390_PC64: diff --git a/src/arch-sh4.cc b/src/arch-sh4.cc index 8e5d336a61..988800005a 100644 --- a/src/arch-sh4.cc +++ b/src/arch-sh4.cc @@ -58,11 +58,13 @@ // output from the linker contains lots of text relocations. That's not // a problem with embedded programming, I guess. 
+#if MOLD_SH4LE || MOLD_SH4BE + #include "mold.h" namespace mold { -using E = SH4; +using E = MOLD_TARGET; // Even though SH-4 uses RELA-type relocations, addends are stored to // relocated places for some reason. @@ -84,7 +86,7 @@ i64 get_addend(u8 *loc, const ElfRel &rel) { case R_SH_GOTOFF: case R_SH_GOTPC: case R_SH_GOTPLT32: - return *(ul32 *)loc; + return *(U32 *)loc; default: return 0; } @@ -108,102 +110,102 @@ void write_addend(u8 *loc, i64 val, const ElfRel &rel) { case R_SH_GOTOFF: case R_SH_GOTPC: case R_SH_GOTPLT32: - *(ul32 *)loc = val; + *(U32 *)loc = val; } } template <> void write_plt_header(Context &ctx, u8 *buf) { if (ctx.arg.pic) { - static const u8 insn[] = { - 0x02, 0xd2, // mov.l 1f, r2 - 0xcc, 0x32, // add r12, r2 - 0x22, 0x50, // mov.l @(8, r2), r0 - 0x21, 0x52, // mov.l @(4, r2), r2 - 0x2b, 0x40, // jmp @r0 - 0x00, 0xe0, // mov #0, r0 - 0, 0, 0, 0, // 1: .long GOTPLT + constexpr U16 insn[] = { + 0xd202, // mov.l 1f, r2 + 0x32cc, // add r12, r2 + 0x5022, // mov.l @(8, r2), r0 + 0x5221, // mov.l @(4, r2), r2 + 0x402b, // jmp @r0 + 0xe000, // mov #0, r0 + 0, 0, // 1: .long GOTPLT }; static_assert(sizeof(insn) == E::plt_hdr_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr; + *(U32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr; } else { - static const u8 insn[] = { - 0x02, 0xd2, // mov.l 1f, r2 - 0x22, 0x50, // mov.l @(8, r2), r0 - 0x21, 0x52, // mov.l @(4, r2), r2 - 0x2b, 0x40, // jmp @r0 - 0x00, 0xe0, // mov #0, r0 - 0x09, 0x00, // nop - 0, 0, 0, 0, // 1: .long GOTPLT + constexpr U16 insn[] = { + 0xd202, // mov.l 1f, r2 + 0x5022, // mov.l @(8, r2), r0 + 0x5221, // mov.l @(4, r2), r2 + 0x402b, // jmp @r0 + 0xe000, // mov #0, r0 + 0x0009, // nop + 0, 0, // 1: .long GOTPLT }; static_assert(sizeof(insn) == E::plt_hdr_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr; + *(U32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr; } } template 
<> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { if (ctx.arg.pic) { - static const u8 insn[] = { - 0x01, 0xd0, // mov.l 1f, r0 - 0xce, 0x00, // mov.l @(r0, r12), r0 - 0x2b, 0x40, // jmp @r0 - 0x01, 0xd1, // mov.l 2f, r1 - 0, 0, 0, 0, // 1: .long GOTPLT_ENTRY - 0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT + constexpr U16 insn[] = { + 0xd001, // mov.l 1f, r0 + 0x00ce, // mov.l @(r0, r12), r0 + 0x402b, // jmp @r0 + 0xd101, // mov.l 2f, r1 + 0, 0, // 1: .long GOTPLT_ENTRY + 0, 0, // 2: .long INDEX_IN_RELPLT }; static_assert(sizeof(insn) == E::plt_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr; - *(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel); + *(U32 *)(buf + 8) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr; + *(U32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel); } else { - static const u8 insn[] = { - 0x01, 0xd0, // mov.l 1f, r0 - 0x02, 0x60, // mov.l @r0, r0 - 0x2b, 0x40, // jmp @r0 - 0x01, 0xd1, // mov.l 2f, r1 - 0, 0, 0, 0, // 1: .long GOTPLT_ENTRY - 0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT + constexpr U16 insn[] = { + 0xd001, // mov.l 1f, r0 + 0x6002, // mov.l @r0, r0 + 0x402b, // jmp @r0 + 0xd101, // mov.l 2f, r1 + 0, 0, // 1: .long GOTPLT_ENTRY + 0, 0, // 2: .long INDEX_IN_RELPLT }; static_assert(sizeof(insn) == E::plt_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx); - *(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel); + *(U32 *)(buf + 8) = sym.get_gotplt_addr(ctx); + *(U32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel); } } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { if (ctx.arg.pic) { - static const u8 insn[] = { - 0x01, 0xd0, // mov.l 1f, r0 - 0xce, 0x00, // mov.l @(r0, r12), r0 - 0x2b, 0x40, // jmp @r0 - 0x09, 0x00, // nop - 0, 0, 0, 0, // 1: .long GOT_ENTRY + constexpr U16 insn[] = { + 0xd001, // mov.l 1f, r0 + 0x00ce, // mov.l @(r0, r12), r0 + 0x402b, // jmp @r0 + 0x0009, // 
nop + 0, 0, // 1: .long GOT_ENTRY }; static_assert(sizeof(insn) == E::pltgot_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx) - ctx.got->shdr.sh_addr; + *(U32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx) - ctx.got->shdr.sh_addr; } else { - static const u8 insn[] = { - 0x01, 0xd0, // mov.l 1f, r0 - 0x02, 0x60, // mov.l @r0, r0 - 0x2b, 0x40, // jmp @r0 - 0x09, 0x00, // nop - 0, 0, 0, 0, // 1: .long GOT_ENTRY + constexpr U16 insn[] = { + 0xd001, // mov.l 1f, r0 + 0x6002, // mov.l @r0, r0 + 0x402b, // jmp @r0 + 0x0009, // nop + 0, 0, // 1: .long GOT_ENTRY }; static_assert(sizeof(insn) == E::pltgot_size); memcpy(buf, insn, sizeof(insn)); - *(ul32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx); + *(U32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx); } } @@ -216,10 +218,10 @@ void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, case R_NONE: break; case R_SH_DIR32: - *(ul32 *)loc = val; + *(U32 *)loc = val; break; case R_SH_REL32: - *(ul32 *)loc = val - this->shdr.sh_addr - offset; + *(U32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; @@ -249,31 +251,31 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; case R_SH_REL32: case R_SH_PLT32: - *(ul32 *)loc = S + A - P; + *(U32 *)loc = S + A - P; break; case R_SH_GOT32: - *(ul32 *)loc = G; + *(U32 *)loc = G; break; case R_SH_GOTPC: - *(ul32 *)loc = GOT + A - P; + *(U32 *)loc = GOT + A - P; break; case R_SH_GOTOFF: - *(ul32 *)loc = S + A - GOT; + *(U32 *)loc = S + A - GOT; break; case R_SH_TLS_GD_32: - *(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; + *(U32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; break; case R_SH_TLS_LD_32: - *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; + *(U32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; break; case R_SH_TLS_LDO_32: - *(ul32 *)loc = S + A - ctx.dtp_addr; + *(U32 *)loc = S + A - ctx.dtp_addr; break; case R_SH_TLS_IE_32: - *(ul32 *)loc 
= sym.get_gottp_addr(ctx) + A - GOT; + *(U32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; break; case R_SH_TLS_LE_32: - *(ul32 *)loc = S + A - ctx.tp_addr; + *(U32 *)loc = S + A - ctx.tp_addr; break; default: unreachable(); @@ -303,9 +305,9 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { switch (rel.r_type) { case R_SH_DIR32: if (std::optional val = get_tombstone(sym, frag)) - *(ul32 *)loc = *val; + *(U32 *)loc = *val; else - *(ul32 *)loc = S + A; + *(U32 *)loc = S + A; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " @@ -364,3 +366,5 @@ void InputSection::scan_relocations(Context &ctx) { } } // namespace mold + +#endif diff --git a/src/arch-sparc64.cc b/src/arch-sparc64.cc index b04bb3011a..713f877459 100644 --- a/src/arch-sparc64.cc +++ b/src/arch-sparc64.cc @@ -159,8 +159,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; - u64 P = (get_addr() + rel.r_offset); - u64 G = (sym.get_got_idx(ctx) * sizeof(Word)); + u64 P = get_addr() + rel.r_offset; + u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; switch (rel.r_type) { @@ -489,12 +489,10 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { *(ub64 *)loc = S + A; break; case R_SPARC_32: - case R_SPARC_UA32: { - i64 val = S + A; - check(val, 0, 1LL << 32); - *(ub32 *)loc = val; + case R_SPARC_UA32: + check(S + A, 0, 1LL << 32); + *(ub32 *)loc = S + A; break; - } case R_SPARC_TLS_DTPOFF32: *(ub32 *)loc = S + A - ctx.dtp_addr; break; diff --git a/src/arch-x86-64.cc b/src/arch-x86-64.cc index 4e0b5f9353..685996f820 100644 --- a/src/arch-x86-64.cc +++ b/src/arch-x86-64.cc @@ -23,8 +23,7 @@ // TLS block as TP (with some addend). As a result, offsets from TP to // thread-local variables (TLVs) in the main executable are all negative. 
// -// https://github.com/rui314/psabi/blob/main/x86-64.pdf -// https://github.com/rui314/psabi/blob/main/i386.pdf +// https://gitlab.com/x86-psABIs/x86-64-ABI #include "mold.h" diff --git a/src/cmdline.cc b/src/cmdline.cc index 3e4aaed6d2..638d2e8de2 100644 --- a/src/cmdline.cc +++ b/src/cmdline.cc @@ -1,13 +1,17 @@ #include "mold.h" +#include #include #include #include -#include -#include +#include #include #include +#if __has_include() +# include +#endif + #if __has_include() # include #else @@ -127,7 +131,8 @@ static const char helpmsg[] = R"( --oformat=binary Omit ELF, section, and program headers --pack-dyn-relocs=[relr,none] Pack dynamic relocations - --package-metadata=STRING Set a given string to .note.package + --package-metadata=PERCENT_ENCODED_STRING + Set a given string to .note.package --perf Print performance statistics --pie, --pic-executable Create a position-independent executable --no-pie, --no-pic-executable @@ -226,8 +231,8 @@ static const char helpmsg[] = R"( -z notext -z textoff -mold: supported targets: elf32-i386 elf64-x86-64 elf32-littlearm elf64-littleaarch64 elf32-littleriscv elf32-bigriscv elf64-littleriscv elf64-bigriscv elf32-powerpc elf64-powerpc elf64-powerpc elf64-powerpcle elf64-s390 elf64-sparc elf32-m68k elf32-sh-linux elf64-loongarch elf32-loongarch -mold: supported emulations: elf_i386 elf_x86_64 armelf_linux_eabi aarch64linux aarch64elf elf32lriscv elf32briscv elf64lriscv elf64briscv elf32ppc elf32ppclinux elf64ppc elf64lppc elf64_s390 elf64_sparc m68kelf shlelf_linux elf64loongarch elf32loongarch)"; +mold: supported targets: elf32-i386 elf64-x86-64 elf32-littlearm elf64-littleaarch64 elf64-bigaarch64 elf32-littleriscv elf32-bigriscv elf64-littleriscv elf64-bigriscv elf32-powerpc elf64-powerpc elf64-powerpc elf64-powerpcle elf64-s390 elf64-sparc elf32-m68k elf32-sh-linux elf64-loongarch elf32-loongarch +mold: supported emulations: elf_i386 elf_x86_64 armelf_linux_eabi aarch64elf aarch64linux aarch64elfb aarch64linuxb 
elf32lriscv elf32briscv elf64lriscv elf64briscv elf32ppc elf32ppclinux elf64ppc elf64lppc elf64_s390 elf64_sparc m68kelf shlelf_linux shelf_linux elf64loongarch elf32loongarch)"; // @file // @@ -447,7 +452,7 @@ parse_encoded_package_metadata(Context &ctx, std::string_view arg) { static std::regex re(R"(([^%]|%[0-9a-fA-F][0-9a-fA-F])*)", flags); if (!std::regex_match(arg.begin(), arg.end(), re)) - Fatal(ctx) << "--encoded-package-metadata: invalid string: " << arg; + Fatal(ctx) << "--package-metadata: invalid string: " << arg; std::ostringstream out; while (!arg.empty()) { @@ -505,10 +510,9 @@ static void read_retain_symbols_file(Context &ctx, std::string_view path) { ctx.arg.retain_symbols_file = std::move(vec); } -static bool is_file(std::string_view path) { - struct stat st; - return stat(std::string(path).c_str(), &st) == 0 && - (st.st_mode & S_IFMT) != S_IFDIR; +static bool is_file(const std::filesystem::path& path) { + std::error_code error; + return !std::filesystem::is_directory(path, error) && !error; } // Example: TEXT DATA .bss foo=0xabcdef 10000 !bar @@ -588,6 +592,33 @@ parse_defsym_value(Context &ctx, std::string_view s) { return get_symbol(ctx, s); } +// Parses a kernel version string, e.g. "6.8.0-47-generic". +static std::tuple +parse_kernel_version(std::string str) { + auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; + static std::regex re(R"(^(\d+)\.(\d+)\.(\d+))", flags); + std::smatch m; + + if (!std::regex_search(str, m, re)) + return {0, 0, 0}; + return {std::stoi(m[1]), std::stoi(m[2]), std::stoi(m[3])}; +} + +// Version 6.11 and 6.12 of the Linux kernel does not return ETXTBSY for +// open(2) on an executable file that is currently running. This function +// returns true if we are running on a Linux kernel older than 6.11 or newer +// than 6.12. 
+static bool returns_etxtbsy() { +#if HAVE_UNAME + struct utsname buf; + if (uname(&buf) == 0 && strcmp(buf.sysname, "Linux") == 0) { + std::tuple ver = parse_kernel_version(buf.release); + return ver < std::tuple{6, 11, 0} || std::tuple{6, 13, 0} <= ver; + } +#endif + return false; +} + template std::vector parse_nonpositional_args(Context &ctx) { // std::span @@ -616,6 +647,7 @@ std::vector parse_nonpositional_args(Context &ctx) { std::optional z_separate_code; std::optional report_undefined; std::optional z_relro; + std::optional z_dynamic_undefined_weak; std::optional separate_debug_file; std::optional shuffle_sections_seed; std::unordered_set rpaths; @@ -788,11 +820,12 @@ std::vector parse_nonpositional_args(Context &ctx) { // Display the version number for ld. The -V option also lists the supported emulations. Out(ctx) << get_mold_version() << "\n Supported emulations:\n elf_x86_64\n elf_i386\n" - << " aarch64linux\n armelf_linux_eabi\n elf64lriscv\n" + << " aarch64elf\n aarch64linux\n aarch64elfb\n" + << " aarch64linuxb\n armelf_linux_eabi\n elf64lriscv\n" << " elf64briscv\n elf32lriscv\n elf32briscv\n" << " elf32ppc\n elf64ppc\n elf64lppc\n elf64_s390\n" << " elf64_sparc\n m68kelf\n shlelf_linux\n" - << " elf64loongarch\n elf32loongarch"; + << " shelf_linux\n elf64loongarch\n elf32loongarch"; version_shown = true; } else if (read_arg("m")) { // -m target: Choose a target. 
@@ -809,8 +842,10 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.emulation = X86_64::name; } else if (arg == "elf_i386") { ctx.arg.emulation = I386::name; - } else if (arg == "aarch64linux") { - ctx.arg.emulation = ARM64::name; + } else if (arg == "aarch64elf" || arg == "aarch64linux") { + ctx.arg.emulation = ARM64LE::name; + } else if (arg == "aarch64elfb" || arg == "aarch64linuxb") { + ctx.arg.emulation = ARM64BE::name; } else if (arg == "armelf_linux_eabi") { ctx.arg.emulation = ARM32::name; } else if (arg == "elf64lriscv") { @@ -833,8 +868,10 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.emulation = SPARC64::name; } else if (arg == "m68kelf") { ctx.arg.emulation = M68K::name; - } else if (arg == "shlelf_linux") { - ctx.arg.emulation = SH4::name; + } else if (arg == "shlelf" || arg == "shlelf_linux") { + ctx.arg.emulation = SH4LE::name; + } else if (arg == "shelf" || arg == "shelf_linux") { + ctx.arg.emulation = SH4BE::name; } else if (arg == "elf64loongarch") { ctx.arg.emulation = LOONGARCH64::name; } else if (arg == "elf32loongarch") { @@ -843,7 +880,7 @@ std::vector parse_nonpositional_args(Context &ctx) { Fatal(ctx) << "unknown -m argument: " << arg; } } else if (read_flag("end-lib")) { - remaining.push_back("--end-lib"); + remaining.emplace_back("--end-lib"); } else if (read_flag("export-dynamic") || read_flag("E")) { ctx.arg.export_dynamic = true; } else if (read_flag("no-export-dynamic")) { @@ -883,10 +920,10 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.print_map = true; } else if (read_flag("Bstatic") || read_flag("dn") || read_flag("static")) { ctx.arg.static_ = true; - remaining.push_back("--Bstatic"); + remaining.emplace_back("--Bstatic"); } else if (read_flag("Bdynamic") || read_flag("dy")) { ctx.arg.static_ = false; - remaining.push_back("--Bdynamic"); + remaining.emplace_back("--Bdynamic"); } else if (read_flag("shared") || read_flag("Bshareable")) { ctx.arg.shared = true; } else if 
(read_arg("spare-dynamic-tags")) { @@ -895,7 +932,7 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.spare_program_headers = parse_number(ctx, "spare-program-headers", arg); } else if (read_flag("start-lib")) { - remaining.push_back("--start-lib"); + remaining.emplace_back("--start-lib"); } else if (read_flag("start-stop")) { ctx.arg.start_stop = true; } else if (read_arg("dependency-file")) { @@ -967,7 +1004,7 @@ std::vector parse_nonpositional_args(Context &ctx) { * usually make any difference because the compiler driver always passes all necessary search * paths to the linker. */ - ctx.arg.library_paths.push_back(std::string(arg)); + ctx.arg.library_paths.emplace_back(arg); } else if (read_arg("sysroot")) { // --sysroot=directory // Use directory as the location of the sysroot, overriding the configure-time default. @@ -1040,8 +1077,7 @@ std::vector parse_nonpositional_args(Context &ctx) { } else { Fatal(ctx) << "invalid --hash-style argument: " << arg; } - } else if (read_arg("soname") || - read_arg("h")) { + } else if (read_arg("soname") || read_arg("h")) { ctx.arg.soname = arg; } else if (read_flag("allow-multiple-definition")) { ctx.arg.allow_multiple_definition = true; @@ -1110,10 +1146,8 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_flag("pack-dyn-relocs=none") || read_z_flag("nopack-relative-relocs")) { ctx.arg.pack_dyn_relocs_relr = false; - } else if (read_arg("encoded-package-metadata")) { - ctx.arg.package_metadata = parse_encoded_package_metadata(ctx, arg); } else if (read_arg("package-metadata")) { - ctx.arg.package_metadata = arg; + ctx.arg.package_metadata = parse_encoded_package_metadata(ctx, arg); } else if (read_flag("stats")) { ctx.arg.stats = true; Counter::enabled = true; @@ -1282,9 +1316,9 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_z_arg("stack-size")) { ctx.arg.z_stack_size = parse_number(ctx, "-z stack-size", arg); } else if (read_z_flag("dynamic-undefined-weak")) { - 
ctx.arg.z_dynamic_undefined_weak = true; + z_dynamic_undefined_weak = true; } else if (read_z_flag("nodynamic-undefined-weak")) { - ctx.arg.z_dynamic_undefined_weak = false; + z_dynamic_undefined_weak = false; } else if (read_z_flag("sectionheader")) { ctx.arg.z_sectionheader = true; } else if (read_z_flag("nosectionheader")) { @@ -1366,25 +1400,23 @@ std::vector parse_nonpositional_args(Context &ctx) { // just copy in the newest one. ctx.arg.plugin = arg; } else if (read_arg("plugin-opt")) { - ctx.arg.plugin_opt.push_back(std::string(arg)); + ctx.arg.plugin_opt.emplace_back(arg); } else if (read_flag("lto-cs-profile-generate")) { - ctx.arg.plugin_opt.push_back("cs-profile-generate"); + ctx.arg.plugin_opt.emplace_back("cs-profile-generate"); } else if (read_arg("lto-cs-profile-file")) { ctx.arg.plugin_opt.push_back("cs-profile-path=" + std::string(arg)); } else if (read_flag("lto-debug-pass-manager")) { - ctx.arg.plugin_opt.push_back("debug-pass-manager"); + ctx.arg.plugin_opt.emplace_back("debug-pass-manager"); } else if (read_flag("disable-verify")) { - ctx.arg.plugin_opt.push_back("disable-verify"); + ctx.arg.plugin_opt.emplace_back("disable-verify"); } else if (read_flag("lto-emit-asm")) { - ctx.arg.plugin_opt.push_back("emit-asm"); - } else if (read_arg("thinlto-jobs")) { - ctx.arg.plugin_opt.push_back("jobs=" + std::string(arg)); + ctx.arg.plugin_opt.emplace_back("emit-asm"); } else if (read_flag("no-legacy-pass-manager")) { - ctx.arg.plugin_opt.push_back("legacy-pass-manager"); + ctx.arg.plugin_opt.emplace_back("legacy-pass-manager"); } else if (read_arg("lto-partitions")) { ctx.arg.plugin_opt.push_back("lto-partitions=" + std::string(arg)); } else if (read_flag("no-lto-legacy-pass-manager")) { - ctx.arg.plugin_opt.push_back("new-pass-manager"); + ctx.arg.plugin_opt.emplace_back("new-pass-manager"); } else if (read_arg("lto-obj-path")) { ctx.arg.plugin_opt.push_back("obj-path=" + std::string(arg)); } else if (read_arg("opt-remarks-filename")) { @@ -1397,7 
+1429,7 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_arg("opt-remarks-passes")) { ctx.arg.plugin_opt.push_back("opt-remarks-passes=" + std::string(arg)); } else if (read_flag("opt-remarks-with_hotness")) { - ctx.arg.plugin_opt.push_back("opt-remarks-with-hotness"); + ctx.arg.plugin_opt.emplace_back("opt-remarks-with-hotness"); } else if (args[0].starts_with("-lto-O")) { ctx.arg.plugin_opt.push_back("O" + std::string(args[0].substr(6))); args = args.subspan(1); @@ -1410,13 +1442,13 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_arg("lto-sample-profile")) { ctx.arg.plugin_opt.push_back("sample-profile=" + std::string(arg)); } else if (read_flag("save-temps")) { - ctx.arg.plugin_opt.push_back("save-temps"); + ctx.arg.plugin_opt.emplace_back("save-temps"); } else if (read_flag("thinlto-emit-imports-files")) { - ctx.arg.plugin_opt.push_back("thinlto-emit-imports-files"); + ctx.arg.plugin_opt.emplace_back("thinlto-emit-imports-files"); } else if (read_arg("thinlto-index-only")) { ctx.arg.plugin_opt.push_back("thinlto-index-only=" + std::string(arg)); } else if (read_flag("thinlto-index-only")) { - ctx.arg.plugin_opt.push_back("thinlto-index-only"); + ctx.arg.plugin_opt.emplace_back("thinlto-index-only"); } else if (read_arg("thinlto-object-suffix-replace")) { ctx.arg.plugin_opt.push_back("thinlto-object-suffix-replace=" + std::string(arg)); @@ -1551,21 +1583,21 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_arg("export-dynamic-symbol-list")) { append(ctx.dynamic_list_patterns, parse_dynamic_list(ctx, arg)); } else if (read_flag("as-needed")) { - remaining.push_back("--as-needed"); + remaining.emplace_back("--as-needed"); } else if (read_flag("no-as-needed")) { - remaining.push_back("--no-as-needed"); + remaining.emplace_back("--no-as-needed"); } else if (read_flag("whole-archive")) { - remaining.push_back("--whole-archive"); + remaining.emplace_back("--whole-archive"); } else if 
(read_flag("no-whole-archive")) { - remaining.push_back("--no-whole-archive"); + remaining.emplace_back("--no-whole-archive"); } else if (read_arg("l") || read_arg("library")) { remaining.push_back("-l" + std::string(arg)); } else if (read_arg("script") || read_arg("T")) { - remaining.push_back(std::string(arg)); + remaining.emplace_back(arg); } else if (read_flag("push-state")) { - remaining.push_back("--push-state"); + remaining.emplace_back("--push-state"); } else if (read_flag("pop-state")) { - remaining.push_back("--pop-state"); + remaining.emplace_back("--pop-state"); } else if (args[0].starts_with("-z") && args[0].size() > 2) { Warn(ctx) << "unknown command line option: " << args[0]; args = args.subspan(1); @@ -1578,7 +1610,7 @@ std::vector parse_nonpositional_args(Context &ctx) { } else { if (args[0].starts_with('-')) Fatal(ctx) << "unknown command line option: " << args[0]; - remaining.push_back(std::string(args[0])); + remaining.emplace_back(args[0]); args = args.subspan(1); } } @@ -1642,6 +1674,12 @@ std::vector parse_nonpositional_args(Context &ctx) { else if (!ctx.arg.section_order.empty()) ctx.arg.z_separate_code = SEPARATE_LOADABLE_SEGMENTS; + // `-z dynamic-undefined-weak` is enabled by default for DSOs. + if (z_dynamic_undefined_weak) + ctx.arg.z_dynamic_undefined_weak = *z_dynamic_undefined_weak; + else + ctx.arg.z_dynamic_undefined_weak = ctx.arg.shared; + // --section-order implies `-z norelro` if (z_relro) ctx.arg.z_relro = *z_relro; @@ -1743,8 +1781,7 @@ std::vector parse_nonpositional_args(Context &ctx) { // However, that mechanism doesn't protect .so files. Therefore, we // want to disable this optimization if we are creating a shared // object file. 
- if (ctx.arg.shared) - ctx.overwrite_output_file = false; + ctx.overwrite_output_file = (!ctx.arg.shared && returns_etxtbsy()); if (!ctx.arg.chroot.empty()) { if (!ctx.arg.Map.empty()) diff --git a/src/config.cc b/src/config.cc index af578ab821..cf8edd62a0 100644 --- a/src/config.cc +++ b/src/config.cc @@ -1,5 +1,4 @@ #include "mold.h" -#include "config.h" namespace mold { diff --git a/src/elf.cc b/src/elf.cc index 8f78df67bc..88f6cd2da3 100644 --- a/src/elf.cc +++ b/src/elf.cc @@ -53,6 +53,15 @@ std::string rel_to_string(u32 r_type) { CASE(R_X86_64_IRELATIVE); CASE(R_X86_64_GOTPCRELX); CASE(R_X86_64_REX_GOTPCRELX); + CASE(R_X86_64_CODE_4_GOTPCRELX); + CASE(R_X86_64_CODE_4_GOTTPOFF); + CASE(R_X86_64_CODE_4_GOTPC32_TLSDESC); + CASE(R_X86_64_CODE_5_GOTPCRELX); + CASE(R_X86_64_CODE_5_GOTTPOFF); + CASE(R_X86_64_CODE_5_GOTPC32_TLSDESC); + CASE(R_X86_64_CODE_6_GOTPCRELX); + CASE(R_X86_64_CODE_6_GOTTPOFF); + CASE(R_X86_64_CODE_6_GOTPC32_TLSDESC); } return unknown_type(r_type); } @@ -107,7 +116,7 @@ std::string rel_to_string(u32 r_type) { } template <> -std::string rel_to_string(u32 r_type) { +std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_AARCH64_NONE); CASE(R_AARCH64_ABS64); @@ -217,6 +226,11 @@ std::string rel_to_string(u32 r_type) { return unknown_type(r_type); } +template <> +std::string rel_to_string(u32 r_type) { + return rel_to_string(r_type); +} + template <> std::string rel_to_string(u32 r_type) { switch (r_type) { @@ -402,7 +416,6 @@ std::string rel_to_string(u32 r_type) { CASE(R_RISCV_ALIGN); CASE(R_RISCV_RVC_BRANCH); CASE(R_RISCV_RVC_JUMP); - CASE(R_RISCV_RVC_LUI); CASE(R_RISCV_RELAX); CASE(R_RISCV_SUB6); CASE(R_RISCV_SET6); @@ -615,6 +628,7 @@ std::string rel_to_string(u32 r_type) { CASE(R_PPC64_DTPREL16_HIGH); CASE(R_PPC64_DTPREL16_HIGHA); CASE(R_PPC64_REL24_NOTOC); + CASE(R_PPC64_ENTRY); CASE(R_PPC64_PLTSEQ); CASE(R_PPC64_PLTCALL); CASE(R_PPC64_PLTSEQ_NOTOC); @@ -857,7 +871,7 @@ std::string rel_to_string(u32 r_type) { } template <> 
-std::string rel_to_string(u32 r_type) { +std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_SH_NONE); CASE(R_SH_DIR32); @@ -890,6 +904,11 @@ std::string rel_to_string(u32 r_type) { return unknown_type(r_type); } +template <> +std::string rel_to_string(u32 r_type) { + return rel_to_string(r_type); +} + template <> std::string rel_to_string(u32 r_type) { switch (r_type) { diff --git a/src/elf.h b/src/elf.h index 3f35f82b5b..458945d100 100644 --- a/src/elf.h +++ b/src/elf.h @@ -12,7 +12,8 @@ namespace mold { // see CMakeLists.txt:mold_instantiate_templates struct X86_64; struct I386; -struct ARM64; +struct ARM64LE; +struct ARM64BE; struct ARM32; struct RV64LE; struct RV64BE; @@ -24,7 +25,8 @@ struct PPC64V2; struct S390X; struct SPARC64; struct M68K; -struct SH4; +struct SH4LE; +struct SH4BE; struct LOONGARCH64; struct LOONGARCH32; @@ -50,10 +52,6 @@ std::ostream &operator<<(std::ostream &out, const ElfRel &rel) { return out; } -enum : u32 { - R_NONE = 0, -}; - enum : u32 { SHN_UNDEF = 0, SHN_LORESERVE = 0xff00, @@ -427,6 +425,10 @@ enum : u32 { // Relocation types // +enum : u32 { + R_NONE = 0, +}; + enum : u32 { R_X86_64_NONE = 0, R_X86_64_64 = 1, @@ -468,6 +470,15 @@ enum : u32 { R_X86_64_IRELATIVE = 37, R_X86_64_GOTPCRELX = 41, R_X86_64_REX_GOTPCRELX = 42, + R_X86_64_CODE_4_GOTPCRELX = 43, + R_X86_64_CODE_4_GOTTPOFF = 44, + R_X86_64_CODE_4_GOTPC32_TLSDESC = 45, + R_X86_64_CODE_5_GOTPCRELX = 46, + R_X86_64_CODE_5_GOTTPOFF = 47, + R_X86_64_CODE_5_GOTPC32_TLSDESC = 48, + R_X86_64_CODE_6_GOTPCRELX = 49, + R_X86_64_CODE_6_GOTTPOFF = 50, + R_X86_64_CODE_6_GOTPC32_TLSDESC = 51, }; enum : u32 { @@ -802,10 +813,6 @@ enum : u32 { R_RISCV_ALIGN = 43, R_RISCV_RVC_BRANCH = 44, R_RISCV_RVC_JUMP = 45, - R_RISCV_RVC_LUI = 46, - R_RISCV_GPREL_LO12_I = 47, - R_RISCV_GPREL_LO12_S = 48, - R_RISCV_GPREL_HI20 = 49, R_RISCV_RELAX = 51, R_RISCV_SUB6 = 52, R_RISCV_SET6 = 53, @@ -995,6 +1002,7 @@ enum : u32 { R_PPC64_DTPREL16_HIGH = 114, R_PPC64_DTPREL16_HIGHA = 115, 
R_PPC64_REL24_NOTOC = 116, + R_PPC64_ENTRY = 118, R_PPC64_PLTSEQ = 119, R_PPC64_PLTCALL = 120, R_PPC64_PLTSEQ_NOTOC = 121, @@ -1782,7 +1790,7 @@ struct ElfRel { }; template <> -struct ElfRel { +struct ElfRel { ElfRel() = default; // Addend is ignored except for base relocations because even though @@ -1797,6 +1805,18 @@ struct ElfRel { il32 r_addend; }; +template <> +struct ElfRel { + ElfRel() = default; + ElfRel(u64 offset, u32 type, u32 sym, i64 addend) + : r_offset(offset), r_sym(sym), r_type(type), r_addend(sym ? 0 : addend) {} + + ub32 r_offset; + ub24 r_sym; + u8 r_type; + ib32 r_addend; +}; + // // Machine descriptions // @@ -1807,7 +1827,8 @@ template concept needs_thunk = requires { E::thunk_size; }; template concept is_x86_64 = std::same_as; template concept is_i386 = std::same_as; -template concept is_arm64 = std::same_as; +template concept is_arm64le = std::same_as; +template concept is_arm64be = std::same_as; template concept is_arm32 = std::same_as; template concept is_rv64le = std::same_as; template concept is_rv64be = std::same_as; @@ -1819,11 +1840,13 @@ template concept is_ppc64v2 = std::same_as; template concept is_s390x = std::same_as; template concept is_sparc64 = std::same_as; template concept is_m68k = std::same_as; -template concept is_sh4 = std::same_as; +template concept is_sh4le = std::same_as; +template concept is_sh4be = std::same_as; template concept is_loongarch64 = std::same_as; template concept is_loongarch32 = std::same_as; template concept is_x86 = is_x86_64 || is_i386; +template concept is_arm64 = is_arm64le || is_arm64be; template concept is_arm = is_arm64 || is_arm32; template concept is_rv64 = is_rv64le || is_rv64be; template concept is_rv32 = is_rv32le || is_rv32be; @@ -1831,6 +1854,7 @@ template concept is_riscv = is_rv64 || is_rv32; template concept is_ppc64 = is_ppc64v1 || is_ppc64v2; template concept is_ppc = is_ppc64 || is_ppc32; template concept is_sparc = is_sparc64; +template concept is_sh4 = is_sh4le || is_sh4be; 
template concept is_loongarch = is_loongarch64 || is_loongarch32; struct X86_64 { @@ -1883,7 +1907,7 @@ struct I386 { static constexpr u32 R_FUNCALL[] = { R_386_PLT32 }; }; -struct ARM64 { +struct ARM64LE { static constexpr std::string_view name = "arm64"; static constexpr bool is_64 = true; static constexpr bool is_le = true; @@ -1910,12 +1934,17 @@ struct ARM64 { static constexpr u32 R_FUNCALL[] = { R_AARCH64_JUMP26, R_AARCH64_CALL26 }; }; +struct ARM64BE : ARM64LE { + static constexpr std::string_view name = "arm64be"; + static constexpr bool is_le = false; +}; + struct ARM32 { static constexpr std::string_view name = "arm32"; static constexpr bool is_64 = false; static constexpr bool is_le = true; static constexpr bool is_rela = false; - static constexpr u32 page_size = 4096; + static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_ARM; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; @@ -1940,8 +1969,10 @@ struct ARM32 { }; }; -struct RV64 { +struct RV64LE { + static constexpr std::string_view name = "riscv64"; static constexpr bool is_64 = true; + static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_RISCV; @@ -1963,18 +1994,15 @@ struct RV64 { static constexpr u32 R_FUNCALL[] = { R_RISCV_CALL, R_RISCV_CALL_PLT }; }; -struct RV64LE : RV64 { - static constexpr std::string_view name = "riscv64"; - static constexpr bool is_le = true; -}; - -struct RV64BE : RV64 { +struct RV64BE : RV64LE { static constexpr std::string_view name = "riscv64be"; static constexpr bool is_le = false; }; -struct RV32 { +struct RV32LE { + static constexpr std::string_view name = "riscv32"; static constexpr bool is_64 = false; + static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_RISCV; @@ -1996,12 +2024,7 @@ struct RV32 { static constexpr u32 
R_FUNCALL[] = { R_RISCV_CALL, R_RISCV_CALL_PLT }; }; -struct RV32LE : RV32 { - static constexpr std::string_view name = "riscv32"; - static constexpr bool is_le = true; -}; - -struct RV32BE : RV32 { +struct RV32BE : RV32LE { static constexpr std::string_view name = "riscv32be"; static constexpr bool is_le = false; }; @@ -2145,7 +2168,7 @@ struct M68K { static constexpr u32 R_FUNCALL[] = { R_68K_PLT32 }; }; -struct SH4 { +struct SH4LE { static constexpr std::string_view name = "sh4"; static constexpr bool is_64 = false; static constexpr bool is_le = true; @@ -2168,12 +2191,18 @@ struct SH4 { static constexpr u32 R_FUNCALL[] = { R_SH_PLT32 }; }; +struct SH4BE : SH4LE { + static constexpr std::string_view name = "sh4be"; + static constexpr bool is_le = false; + static constexpr u8 filler[] = { 0x90, 0x00 }; // nop +}; + struct LOONGARCH64 { static constexpr std::string_view name = "loongarch64"; static constexpr bool is_64 = true; static constexpr bool is_le = true; static constexpr bool is_rela = true; - static constexpr u32 page_size = 16384; + static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_LOONGARCH; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; @@ -2198,7 +2227,7 @@ struct LOONGARCH32 { static constexpr bool is_64 = false; static constexpr bool is_le = true; static constexpr bool is_rela = true; - static constexpr u32 page_size = 16384; + static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_LOONGARCH; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; diff --git a/src/filetype.cc b/src/filetype.cc index da514cc3bc..6186b66e46 100644 --- a/src/filetype.cc +++ b/src/filetype.cc @@ -166,7 +166,7 @@ static std::string_view get_elf_type(u8 *buf) { case EM_ARM: return ARM32::name; case EM_AARCH64: - return ARM64::name; + return is_le ? ARM64LE::name : ARM64BE::name; case EM_RISCV: if (is_le) return is_64 ? 
RV64LE::name : RV32LE::name; @@ -182,7 +182,7 @@ static std::string_view get_elf_type(u8 *buf) { case EM_68K: return M68K::name; case EM_SH: - return SH4::name; + return is_le ? SH4LE::name : SH4BE::name; case EM_LOONGARCH: return is_64 ? LOONGARCH64::name : LOONGARCH32::name; default: diff --git a/src/gc-sections.cc b/src/gc-sections.cc index efc6cd6dc7..0aa0a47dbb 100644 --- a/src/gc-sections.cc +++ b/src/gc-sections.cc @@ -24,7 +24,6 @@ static bool should_keep(const InputSection &isec) { type == SHT_INIT_ARRAY || type == SHT_FINI_ARRAY || type == SHT_PREINIT_ARRAY || - (is_arm32 && type == SHT_ARM_EXIDX) || name.starts_with(".ctors") || name.starts_with(".dtors") || name.starts_with(".init") || @@ -100,34 +99,36 @@ static void visit(Context &ctx, InputSection *isec, tbb::feeder *> &feeder, i64 depth) { assert(isec->is_visited); + // Mark a section alive. For better performacne, we don't call + // `feeder.add` too often. + auto mark = [&](InputSection *sec) { + if (mark_section(sec)) { + if (depth < 3) + visit(ctx, sec, feeder, depth + 1); + else + feeder.add(sec); + } + }; + // If this is a text section, .eh_frame may contain records // describing how to handle exceptions for that function. // We want to keep associated .eh_frame records. for (FdeRecord &fde : isec->get_fdes()) for (const ElfRel &rel : fde.get_rels(isec->file).subspan(1)) if (Symbol *sym = isec->file.symbols[rel.r_sym]) - if (mark_section(sym->get_input_section())) - feeder.add(sym->get_input_section()); + mark(sym->get_input_section()); for (const ElfRel &rel : isec->get_rels(ctx)) { - Symbol &sym = *isec->file.symbols[rel.r_sym]; - // Symbol can refer to either a section fragment or an input section. - // Mark a fragment as alive. - if (SectionFragment *frag = sym.get_frag()) { + Symbol &sym = *isec->file.symbols[rel.r_sym]; + if (SectionFragment *frag = sym.get_frag()) frag->is_alive = true; - continue; - } - - // Mark a section alive. 
For better performacne, we don't call - // `feeder.add` too often. - if (mark_section(sym.get_input_section())) { - if (depth < 3) - visit(ctx, sym.get_input_section(), feeder, depth + 1); - else - feeder.add(sym.get_input_section()); - } + else + mark(sym.get_input_section()); } + + if constexpr (is_arm32) + mark(isec->extra.exidx); } // Mark all reachable sections diff --git a/src/gdb-index.cc b/src/gdb-index.cc index a87b769180..759f2eae56 100644 --- a/src/gdb-index.cc +++ b/src/gdb-index.cc @@ -58,7 +58,6 @@ #include "mold.h" #include -#include namespace mold { @@ -127,16 +126,10 @@ struct SectionHeader { }; struct NameType { - bool operator==(const NameType &) const = default; - - bool operator<(const NameType &other) const { - return std::tuple(hash, type, name) < - std::tuple(other.hash, other.type, other.name); - } - - std::string_view name; + auto operator<=>(const NameType &) const = default; u64 hash; u8 type; + std::string_view name; }; struct MapValue { @@ -539,7 +532,7 @@ static i64 read_pubnames_cu(Context &ctx, const PubnamesHdr &hdr, u8 type = *p++; std::string_view name = (char *)p; p += name.size() + 1; - cu->nametypes.push_back({name, hash_string(name), type}); + cu->nametypes.push_back(NameType{hash_string(name), type, name}); } return size; @@ -626,7 +619,7 @@ static std::vector read_compunits(Context &ctx) { // Uniquify elements because GCC 11 seems to emit one record for each // comdat group which results in having a lot of duplicate records. 
- tbb::parallel_for_each(cus, [&](Compunit &cu) { + tbb::parallel_for_each(cus, [](Compunit &cu) { sort(cu.nametypes); remove_duplicates(cu.nametypes); }); diff --git a/src/icf.cc b/src/icf.cc index 4ef5fe8235..4fa359a0b4 100644 --- a/src/icf.cc +++ b/src/icf.cc @@ -192,7 +192,7 @@ static void merge_leaf_nodes(Context &ctx) { static Counter non_eligible("icf_non_eligibles"); static Counter leaf("icf_leaf_nodes"); - tbb::concurrent_unordered_map *, InputSection *, + tbb::concurrent_unordered_map *, Atomic *>, LeafHasher, LeafEq> map; tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { @@ -209,8 +209,11 @@ static void merge_leaf_nodes(Context &ctx) { leaf++; isec->icf_leaf = true; auto [it, inserted] = map.insert({isec.get(), isec.get()}); - if (!inserted && isec->get_priority() < it->second->get_priority()) - it->second = isec.get(); + if (!inserted) { + InputSection *isec2 = it->second.load(); + while (isec->get_priority() < isec2->get_priority() && + !it->second.compare_exchange_strong(isec2, isec.get())); + } } else { eligible++; isec->icf_eligible = true; @@ -465,7 +468,7 @@ static void print_icf_sections(Context &ctx) { }); tbb::parallel_sort(leaders.begin(), leaders.end(), - [&](InputSection *a, InputSection *b) { + [](InputSection *a, InputSection *b) { return a->get_priority() < b->get_priority(); }); @@ -568,14 +571,19 @@ void icf_sections(Context &ctx) { { Timer t(ctx, "group"); - auto *map = new tbb::concurrent_unordered_map *>; + auto *map = + new tbb::concurrent_unordered_map *>>; + std::span digest = digests[slot]; tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) { InputSection *isec = sections[i]; auto [it, inserted] = map->insert({digest[i], isec}); - if (!inserted && isec->get_priority() < it->second->get_priority()) - it->second = isec; + if (!inserted) { + InputSection *isec2 = it->second.load(); + while (isec->get_priority() < isec2->get_priority() && + !it->second.compare_exchange_strong(isec2, isec)); + } }); 
tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) { diff --git a/src/input-files.cc b/src/input-files.cc index b25d6a6fba..ec853e2f25 100644 --- a/src/input-files.cc +++ b/src/input-files.cc @@ -101,7 +101,7 @@ InputFile::InputFile(Context &ctx, MappedFile *mf) template std::span> InputFile::get_phdrs() { - ElfEhdr &ehdr = get_ehdr(); + ElfEhdr &ehdr = *(ElfEhdr *)mf->data; return {(ElfPhdr *)(mf->data + ehdr.e_phoff), ehdr.e_phnum}; } @@ -123,13 +123,6 @@ std::string_view InputFile::get_source_name() const { return ""; } -template -ObjectFile::ObjectFile(Context &ctx, MappedFile *mf, - std::string archive_name, bool is_in_lib) - : InputFile(ctx, mf), archive_name(archive_name), is_in_lib(is_in_lib) { - this->is_alive = !is_in_lib; -} - template static bool is_debug_section(const ElfShdr &shdr, std::string_view name) { return !(shdr.sh_flags & SHF_ALLOC) && name.starts_with(".debug"); @@ -175,7 +168,7 @@ ObjectFile::parse_note_gnu_property(Context &ctx, const ElfShdr &shdr) // // [ "vendor-name" *]+ ]* -template +template static void read_riscv_attributes(Context &ctx, ObjectFile &file, std::string_view data) { if (data.empty()) @@ -482,6 +475,13 @@ void ObjectFile::initialize_sections(Context &ctx) { target->relsec_idx = i; } } + + // Attach .arm.exidx sections to their corresponding sections + if constexpr (is_arm32) + for (std::unique_ptr> &isec : this->sections) + if (isec && isec->shdr().sh_type == SHT_ARM_EXIDX) + if (InputSection *target = sections[isec->shdr().sh_link].get()) + target->extra.exidx = isec.get(); } // .eh_frame contains data records explaining how to handle exceptions. 
@@ -661,7 +661,7 @@ void ObjectFile::initialize_symbols(Context &ctx) { if (ver != "@" && ver != "@@") { if (ver.starts_with("@@")) key = name; - has_symver.set(i - this->first_global); + has_symver[i - this->first_global] = true; } } @@ -693,7 +693,7 @@ void ObjectFile::initialize_symbols(Context &ctx) { template void ObjectFile::sort_relocations(Context &ctx) { if constexpr (is_riscv || is_loongarch) { - auto less = [&](const ElfRel &a, const ElfRel &b) { + auto less = [](const ElfRel &a, const ElfRel &b) { return a.r_offset < b.r_offset; }; @@ -940,7 +940,7 @@ template static u64 get_rank(const Symbol &sym) { if (!sym.file) return 7 << 24; - return get_rank(sym.file, sym.esym(), !sym.file->is_alive); + return get_rank(sym.file, sym.esym(), !sym.file->is_reachable); } // Symbol's visibility is set to the most restrictive one. For example, @@ -1002,7 +1002,7 @@ void ObjectFile::resolve_symbols(Context &ctx) { std::scoped_lock lock(sym.mu); // The current ELF symbol's priority is higher than the symbol's priority. 
- if (get_rank(this, esym, !this->is_alive) < get_rank(sym)) { + if (get_rank(this, esym, !this->is_reachable) < get_rank(sym)) { sym.file = this; sym.set_input_section(isec); sym.value = esym.st_value; @@ -1017,7 +1017,7 @@ template void ObjectFile::mark_live_objects(Context &ctx, std::function *)> feeder) { - assert(this->is_alive); + assert(this->is_reachable); for (i64 i = this->first_global; i < this->elf_syms.size(); i++) { const ElfSym &esym = this->elf_syms[i]; @@ -1035,7 +1035,7 @@ ObjectFile::mark_live_objects(Context &ctx, bool undef_ref = esym.is_undef() && (!esym.is_weak() || sym.file->is_dso); bool common_ref = esym.is_common() && !sym.esym().is_common(); - if ((undef_ref || common_ref) && !sym.file->is_alive.test_and_set()) { + if ((undef_ref || common_ref) && !sym.file->is_reachable.test_and_set()) { feeder(sym.file); if (sym.is_traced) Out(ctx) << "trace-symbol: " << *this << " keeps " << *sym.file @@ -1141,7 +1141,7 @@ static bool should_write_to_local_symtab(Context &ctx, Symbol &sym) { // merged, so their origins shouldn't matter, but I don't really // know the rationale. Anyway, this is the behavior of the // traditional linkers. - if (sym.name().starts_with(".L")) { + if (sym.name().starts_with(".L") || sym.name() == "L0\001") { if (ctx.arg.discard_locals) return false; @@ -1157,7 +1157,7 @@ template void ObjectFile::compute_symtab_size(Context &ctx) { this->output_sym_indices.resize(this->elf_syms.size(), -1); - auto is_alive = [&](Symbol &sym) -> bool { + auto is_alive = [](Symbol &sym) -> bool { if (SectionFragment *frag = sym.get_frag()) return frag->is_alive; if (InputSection *isec = sym.get_input_section()) @@ -1427,8 +1427,11 @@ SharedFile::mark_live_objects(Context &ctx, if (sym.is_traced) print_trace_symbol(ctx, *this, esym, sym); + // We follow undefined symbols in a DSO only to handle + // --no-allow-shlib-undefined. 
if (esym.is_undef() && !esym.is_weak() && sym.file && - !sym.file->is_alive.test_and_set()) { + (!sym.file->is_dso || !ctx.arg.allow_shlib_undefined) && + !sym.file->is_reachable.test_and_set()) { feeder(sym.file); if (sym.is_traced) @@ -1456,7 +1459,7 @@ std::span *> SharedFile::get_symbols_at(Symbol *sym) { }); auto [begin, end] = std::equal_range(sorted_syms.begin(), sorted_syms.end(), - sym, [&](Symbol *a, Symbol *b) { + sym, [](Symbol *a, Symbol *b) { return a->esym().st_value < b->esym().st_value; }); diff --git a/src/input-sections.cc b/src/input-sections.cc index 0dabdac173..3cc5bc7e2a 100644 --- a/src/input-sections.cc +++ b/src/input-sections.cc @@ -1,6 +1,5 @@ #include "mold.h" -#include #include #include @@ -131,6 +130,11 @@ static void do_action(Context &ctx, Action action, InputSection &isec, break; case COPYREL: // Create a copy relocation + if (!sym.file->is_dso) { + assert(sym.esym().is_undef_weak()); + Error(ctx) << isec << ": cannot create a copy relocation for " + << sym <<"; recompile with -fPIE or -fPIC"; + } sym.flags |= NEEDS_COPYREL; break; case PLT: @@ -241,27 +245,23 @@ void InputSection::write_to(Context &ctx, u8 *buf) { // relocations are allowed to remove bytes from the middle of a // section and shrink the overall size of it. if constexpr (is_riscv || is_loongarch) { - if (extra.r_deltas.empty()) { + std::span deltas = extra.r_deltas; + + if (deltas.empty()) { // If a section is not relaxed, we can copy it as a one big chunk. copy_contents(ctx, buf); } else { // A relaxed section is copied piece-wise. 
- std::span> rels = get_rels(ctx); - u8 *buf2 = buf; - i64 pos = 0; - - for (i64 i = 0; i < rels.size(); i++) { - i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i]; - if (delta == 0) - continue; - assert(delta > 0); - - const ElfRel &r = rels[i]; - memcpy(buf2, contents.data() + pos, r.r_offset - pos); - buf2 += r.r_offset - pos; - pos = r.r_offset + delta; + memcpy(buf, contents.data(), deltas[0].offset); + + for (i64 i = 0; i < deltas.size(); i++) { + RelocDelta x = deltas[i]; + i64 end = (i + 1 == deltas.size()) ? contents.size() : deltas[i + 1].offset; + i64 removed_bytes = get_removed_bytes(deltas, i); + memcpy(buf + x.offset - x.delta + removed_bytes, + contents.data() + x.offset + removed_bytes, + end - x.offset - removed_bytes); } - memcpy(buf2, contents.data() + pos, contents.size() - pos); } } else { copy_contents(ctx, buf); @@ -357,7 +357,7 @@ bool InputSection::record_undef_error(Context &ctx, const ElfRel &rel) template MergeableSection::MergeableSection(Context &ctx, MergedSection &parent, std::unique_ptr> &isec) - : parent(parent), section(std::move(isec)), p2align(section->p2align) { + : parent(parent), p2align(isec->p2align), section(std::move(isec)) { section->uncompress(ctx); std::scoped_lock lock(parent.mu); @@ -396,8 +396,7 @@ template void MergeableSection::split_contents(Context &ctx) { std::string_view data = section->contents; if (data.size() > UINT32_MAX) - Fatal(ctx) << *section - << ": mergeable section too large"; + Fatal(ctx) << *section << ": mergeable section too large"; i64 entsize = parent.shdr.sh_entsize; diff --git a/src/linker-script.cc b/src/linker-script.cc index 5ecb26aa56..cbb5be595d 100644 --- a/src/linker-script.cc +++ b/src/linker-script.cc @@ -6,7 +6,6 @@ #include "mold.h" #include -#include namespace mold { @@ -268,8 +267,8 @@ static bool read_label(std::span &tok, std::string label) { template std::span Script::read_version_script_commands(std::span tok, - std::string_view ver_str, u16 ver_idx, - bool is_global, 
bool is_cpp) { + std::string_view ver_str, u16 ver_idx, + bool is_global, bool is_cpp) { while (!tok.empty() && tok[0] != "}") { if (read_label(tok, "global")) { is_global = true; @@ -333,7 +332,7 @@ Script::read_version_script(std::span tok) { } else { ver_str = tok[0]; ver_idx = next_ver++; - ctx.arg.version_definitions.push_back(std::string(tok[0])); + ctx.arg.version_definitions.emplace_back(tok[0]); tok = tok.subspan(1); } @@ -359,8 +358,8 @@ void Script::parse_version_script() { template std::span Script::read_dynamic_list_commands(std::span tok, - std::vector &result, - bool is_cpp) { + std::vector &result, + bool is_cpp) { while (!tok.empty() && tok[0] != "}") { if (tok[0] == "extern") { tok = tok.subspan(1); diff --git a/src/lto-unix.cc b/src/lto-unix.cc index 7d839c870a..8af3b9fd68 100644 --- a/src/lto-unix.cc +++ b/src/lto-unix.cc @@ -178,12 +178,12 @@ static PluginStatus add_input_file(const char *path) { MappedFile *mf = must_open_file(ctx, path); - ObjectFile *file = new ObjectFile(ctx, mf, "", false); + ObjectFile *file = new ObjectFile(ctx, mf, ""); ctx.obj_pool.emplace_back(file); lto_objects.push_back(file); file->priority = file_priority++; - file->is_alive = true; + file->is_reachable = true; file->parse(ctx); file->resolve_symbols(ctx); return LDPS_OK; @@ -284,7 +284,7 @@ get_symbols(const void *handle, int nsyms, PluginSymbol *psyms, bool is_v2) { // If file is an archive member which was not chose to be included in // to the final result, we need to make the plugin to ignore all // symbols. 
- if (!file.is_alive) { + if (!file.is_reachable) { assert(!is_v2); for (int i = 0; i < nsyms; i++) psyms[i].resolution = LDPR_PREEMPTED_REG; @@ -342,7 +342,7 @@ static void restart_process(Context &ctx) { args.push_back(strdup(std::string(arg).c_str())); for (std::unique_ptr> &file : ctx.obj_pool) - if (file->is_lto_obj && !file->is_alive) + if (file->is_lto_obj && !file->is_reachable) args.push_back(strdup(("--:ignore-ir-file=" + file->mf->get_identifier()).c_str())); @@ -598,6 +598,12 @@ create_plugin_input_file(Context &ctx, MappedFile *mf) { template ObjectFile *read_lto_object(Context &ctx, MappedFile *mf) { + if (ctx.arg.plugin.empty()) + Fatal(ctx) << mf->name << ": don't know how to handle this LTO object file " + << "because no -plugin option was given. Please make sure you " + << "added -flto not only for creating object files but also for " + << "creating the final executable."; + load_lto_plugin(ctx); // V0 API's claim_file is not thread-safe. @@ -606,12 +612,6 @@ ObjectFile *read_lto_object(Context &ctx, MappedFile *mf) { if (!is_gcc_linker_api_v1) lock.lock(); - if (ctx.arg.plugin.empty()) - Fatal(ctx) << mf->name << ": don't know how to handle this LTO object file " - << "because no -plugin option was given. Please make sure you " - << "added -flto not only for creating object files but also for " - << "creating the final executable."; - // Create mold's object instance ObjectFile *obj = new ObjectFile; ctx.obj_pool.emplace_back(obj); @@ -685,7 +685,7 @@ std::vector *> run_lto_plugin(Context &ctx) { phase = 2; // Set `referenced_by_regular_obj` bit. - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { if (!file->is_lto_obj) { for (Symbol *sym : file->get_global_syms()) { if (sym->file && !sym->file->is_dso && @@ -717,7 +717,7 @@ std::vector *> run_lto_plugin(Context &ctx) { // given to the LTO backend. Such sections contains code and data for // peripherails (typically GPUs). 
for (ObjectFile *file : ctx.objs) { - if (file->is_alive && !file->is_lto_obj && file->is_gcc_offload_obj) { + if (file->is_reachable && !file->is_lto_obj && file->is_gcc_offload_obj) { PluginInputFile pfile = create_plugin_input_file(ctx, file->mf); int claimed = false; claim_file_hook(&pfile, &claimed); diff --git a/src/main.cc b/src/main.cc index d661709a1e..94d0be7f29 100644 --- a/src/main.cc +++ b/src/main.cc @@ -3,14 +3,9 @@ #include #include -#include -#include -#include -#include #include #include #include -#include #include #ifdef _WIN32 @@ -23,6 +18,7 @@ // see CMakeLists.txt:mold_instantiate_templates #ifdef MOLD_X86_64 int main(int argc, char **argv) { + mold::set_mimalloc_options(); return mold::mold_main(argc, argv); } #endif @@ -56,11 +52,11 @@ static ObjectFile *new_object_file(Context &ctx, ReaderContext &rctx, check_file_compatibility(ctx, rctx, mf); - bool in_lib = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive); - - ObjectFile *file = new ObjectFile(ctx, mf, archive_name, in_lib); + ObjectFile *file = new ObjectFile(ctx, mf, archive_name); ctx.obj_pool.emplace_back(file); file->priority = ctx.file_priority++; + file->is_reachable = + !rctx.in_lib && (archive_name.empty() || rctx.whole_archive); // https://oneapi-src.github.io/oneTBB/main/reference/task_group_extensions.html rctx.tg->run([file, &ctx] { file->parse(ctx); }); @@ -81,8 +77,9 @@ static ObjectFile *new_lto_obj(Context &ctx, ReaderContext &rctx, ObjectFile *file = read_lto_object(ctx, mf); file->priority = ctx.file_priority++; file->archive_name = archive_name; - file->is_in_lib = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive); - file->is_alive = !file->is_in_lib; + file->is_reachable = + !rctx.in_lib && (archive_name.empty() || rctx.whole_archive); + if (ctx.arg.trace) Out(ctx) << "trace: " << *file; return file; @@ -96,7 +93,7 @@ new_shared_file(Context &ctx, ReaderContext &rctx, MappedFile *mf) { SharedFile *file = new SharedFile(ctx, mf); 
ctx.dso_pool.emplace_back(file); file->priority = ctx.file_priority++; - file->is_alive = !rctx.as_needed; + file->is_reachable = !rctx.as_needed; rctx.tg->run([file, &ctx] { file->parse(ctx); }); if (ctx.arg.trace) @@ -327,7 +324,7 @@ static void read_input_files(Context &ctx, std::span args) { template static bool has_lto_obj(Context &ctx) { for (ObjectFile *file : ctx.objs) - if (file->is_alive && (file->is_lto_obj || file->is_gcc_offload_obj)) + if (file->is_reachable && (file->is_lto_obj || file->is_gcc_offload_obj)) return true; return false; } @@ -436,8 +433,8 @@ int mold_main(int argc, char **argv) { // Now that we know which object files are to be included to the // final output, we can remove unnecessary files. - std::erase_if(ctx.objs, [](InputFile *file) { return !file->is_alive; }); - std::erase_if(ctx.dsos, [](InputFile *file) { return !file->is_alive; }); + std::erase_if(ctx.objs, [](InputFile *file) { return !file->is_reachable; }); + std::erase_if(ctx.dsos, [](InputFile *file) { return !file->is_reachable; }); // Parse .eh_frame section contents. parse_eh_frame_sections(ctx); @@ -465,10 +462,18 @@ int mold_main(int argc, char **argv) { // Set is_imported and is_exported bits for each symbol. compute_import_export(ctx); + // Make sure that there's no duplicate symbol + if (!ctx.arg.allow_multiple_definition) + check_duplicate_symbols(ctx); + // Set "address-taken" bits for input sections. if (ctx.arg.icf) compute_address_significance(ctx); + // Handle PPC64-specific .opd sections. + if constexpr (is_ppc64v1) + ppc64v1_rewrite_opd(ctx); + // Garbage-collect unreachable sections. if (ctx.arg.gc_sections) gc_sections(ctx); @@ -480,19 +485,13 @@ int mold_main(int argc, char **argv) { // Create linker-synthesized sections such as .got or .plt. 
create_synthetic_sections(ctx); - // Make sure that there's no duplicate symbol - if (!ctx.arg.allow_multiple_definition) - check_duplicate_symbols(ctx); - + // Handle --no-allow-shlib-undefined if (!ctx.arg.allow_shlib_undefined) check_shlib_undefined(ctx); // Warn if symbols with different types are defined under the same name. check_symbol_types(ctx); - if constexpr (is_ppc64v1) - ppc64v1_rewrite_opd(ctx); - // Bin input sections into output sections. create_output_sections(ctx); @@ -669,6 +668,10 @@ int mold_main(int argc, char **argv) { // At this point, both memory and file layouts are fixed. + // Gather thunk symbols and attach them to themselves. + if constexpr (needs_thunk) + gather_thunk_addresses(ctx); + t_before_copy.stop(); // Create an output file @@ -686,13 +689,7 @@ int mold_main(int argc, char **argv) { // Dynamic linker works better with sorted .rela.dyn section, // so we sort them. - ctx.reldyn->sort(ctx); - - // .note.gnu.build-id section contains a cryptographic hash of the - // entire output file. Now that we wrote everything except build-id, - // we can compute it. - if (ctx.buildid) - write_build_id(ctx); + sort_reldyn(ctx); // .gdb_index's contents cannot be constructed before applying // relocations to other debug sections. We have relocated debug @@ -700,6 +697,12 @@ int mold_main(int argc, char **argv) { if (ctx.gdb_index && ctx.arg.separate_debug_file.empty()) write_gdb_index(ctx); + // .note.gnu.build-id section contains a cryptographic hash of the + // entire output file. Now that we wrote everything except build-id, + // we can compute it. 
+ if (ctx.buildid) + write_build_id(ctx); + if (!ctx.arg.separate_debug_file.empty()) write_gnu_debuglink(ctx); diff --git a/src/mapfile.cc b/src/mapfile.cc index 877361865d..d18e2f06fb 100644 --- a/src/mapfile.cc +++ b/src/mapfile.cc @@ -4,8 +4,8 @@ #include #include #include +#include #include -#include namespace mold { diff --git a/src/mold.h b/src/mold.h index a61259cc5b..24a915c3d1 100644 --- a/src/mold.h +++ b/src/mold.h @@ -3,11 +3,8 @@ #include "../lib/common.h" #include "elf.h" -#include -#include #include #include -#include #include #include #include @@ -15,13 +12,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include #include @@ -89,12 +83,11 @@ struct SymbolAux { i32 plt_idx = -1; i32 pltgot_idx = -1; i32 dynsym_idx = -1; + i32 opd_idx = -1; u32 djb_hash = 0; -}; -template <> -struct SymbolAux : SymbolAux { - i32 opd_idx = -1; + // For range extension thunks + std::vector thunk_addrs; }; // @@ -113,25 +106,57 @@ class Thunk { i64 size() const { return E::thunk_hdr_size + symbols.size() * E::thunk_size; } void copy_buf(Context &ctx); - u64 get_addr(i64 idx) const { - return output_section.shdr.sh_addr + offset + E::thunk_hdr_size + - idx * E::thunk_size; + u64 get_addr() const { + return output_section.shdr.sh_addr + offset; } - static constexpr i64 alignment = 16; + u64 get_addr(i64 i) const { + return get_addr() + E::thunk_hdr_size + E::thunk_size * i; + } OutputSection &output_section; i64 offset; - std::mutex mu; std::vector *> symbols; + std::string name; }; -struct ThunkRef { - static constexpr i64 MAX_SYM_IDX = (1 << 17) - 1; +template +static consteval i64 get_branch_distance() { + // ARM64's branch has 26 bits immediate. The immediate is padded with + // implicit two-bit zeros because all instructions are 4 bytes aligned + // and therefore the least two bits are always zero. So the branch + // operand is effectively 28 bits long. 
That means the branch range is + // [-2^27, 2^27) or PC ± 128 MiB. + if (is_arm64) + return 1 << 27; + + // ARM32's Thumb branch has 24 bits immediate, and the instructions are + // aligned to 2, so it's effectively 25 bits. It's [-2^24, 2^24) or PC ± + // 16 MiB. + // + // ARM32's non-Thumb branches have twice longer range than its Thumb + // counterparts, but we conservatively use the Thumb's limitation. + if (is_arm32) + return 1 << 24; - i32 thunk_idx : 14 = -1; - i32 sym_idx : 18 = -1; -}; + // PPC's branch has 24 bits immediate, and the instructions are aligned + // to 4, therefore the reach is [-2^25, 2^25) or PC ± 32 MiB. + assert(is_ppc); + return 1 << 25; +} + +// The maximum distance of branch instructions used for function calls. +// +// The exact origin for computing a destination varies slightly depending +// on the target architecture. For example, ARM32's B instruction jumps to +// the branch's address + immediate + 4 (i.e., B with offset 0 jumps to +// the next instruction), while RISC-V has no such implicit bias. Here, we +// subtract 32 as a safety margin that is large enough for all targets. +template +static constexpr i64 branch_distance = get_branch_distance() - 32; + +template +void gather_thunk_addresses(Context &ctx); // // input-sections.cc @@ -245,14 +270,29 @@ struct FdeRecord { template struct InputSectionExtras {}; -template +template requires is_arm32 struct InputSectionExtras { - std::vector thunk_refs; + InputSection *exidx = nullptr; +}; + +struct RelocDelta { + u64 offset : 38; + i64 delta : 26; }; +// RISC-V and LoongArch support code-shrinking linker relaxation. +// +// r_deltas is used to manage the locations where instructions are removed +// from a section. r_deltas is sorted by offset. Each RelocDelta indicates +// that the contents at and after `offset` and up to the next RelocDelta +// offset need to be shifted towards the beginning of the section by +// `delta` bytes when copying section contents to the output buffer. 
+// +// Since code-shrinking relaxation never bloats section contents, `delta` +// increases monotonically within the vector as well. template requires is_riscv || is_loongarch struct InputSectionExtras { - std::vector r_deltas; + std::vector r_deltas; }; // InputSection represents a section in an input object file. @@ -300,7 +340,7 @@ class __attribute__((aligned(4))) InputSection { bool uncompressed = false; // For COMDAT de-duplication and garbage collection - std::atomic_bool is_alive = true; + Atomic is_alive = true; u8 p2align = 0; // For ICF @@ -335,8 +375,6 @@ class __attribute__((aligned(4))) InputSection { void apply_toc_rel(Context &ctx, Symbol &sym, const ElfRel &rel, u8 *loc, u64 S, i64 A, u64 P, ElfRel **dynrel); - u64 get_thunk_addr(i64 idx); - std::optional get_tombstone(Symbol &sym, SectionFragment *frag); }; @@ -408,7 +446,7 @@ class __attribute__((aligned(4))) Chunk { virtual i64 get_reldyn_size(Context &ctx) const { return 0; } virtual void construct_relr(Context &ctx) {} virtual void copy_buf(Context &ctx) {} - virtual void write_to(Context &ctx, u8 *buf, ElfRel *rel) { unreachable(); } + virtual void write_to(Context &ctx, u8 *buf) { unreachable(); } virtual void update_shdr(Context &ctx) {} std::string_view name; @@ -533,7 +571,7 @@ class OutputSection : public Chunk { i64 get_reldyn_size(Context &ctx) const override; void construct_relr(Context &ctx) override; void copy_buf(Context &ctx) override; - void write_to(Context &ctx, u8 *buf, ElfRel *rel) override; + void write_to(Context &ctx, u8 *buf) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; @@ -546,6 +584,9 @@ class OutputSection : public Chunk { std::unique_ptr> reloc_sec; std::vector> abs_rels; Atomic sh_flags; + + // Used only by create_output_sections() + std::vector *>> members_vec; }; template @@ -559,9 +600,9 @@ class GotSection : public Chunk { this->shdr.sh_addralign = sizeof(Word); // We always create a .got so that 
_GLOBAL_OFFSET_TABLE_ has - // something to point to. s390x psABI define GOT[1] as a - // reserved slot, so we allocate one more for them. - this->shdr.sh_size = (is_s390x ? 2 : 1) * sizeof(Word); + // something to point to. s390x psABI define GOT[1] and GOT[2] + // as reserved slots, so we allocate two more for them. + this->shdr.sh_size = (is_s390x ? 3 : 1) * sizeof(Word); } void add_got_symbol(Context &ctx, Symbol *sym); @@ -677,7 +718,6 @@ class RelDynSection : public Chunk { } void update_shdr(Context &ctx) override; - void sort(Context &ctx); }; template @@ -737,8 +777,6 @@ class DynstrSection : public Chunk { i64 find_string(std::string_view str); void copy_buf(Context &ctx) override; - i64 dynsym_offset = -1; - private: std::unordered_map strings; }; @@ -810,6 +848,7 @@ class DynsymSection : public Chunk { void copy_buf(Context &ctx) override; std::vector *> symbols; + i64 dynstr_offset = -1; }; template @@ -819,12 +858,18 @@ class HashSection : public Chunk { this->name = ".hash"; this->shdr.sh_type = SHT_HASH; this->shdr.sh_flags = SHF_ALLOC; - this->shdr.sh_entsize = 4; - this->shdr.sh_addralign = 4; + this->shdr.sh_entsize = sizeof(Entry); + this->shdr.sh_addralign = sizeof(Entry); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; + +private: + // Even though u32 should suffice for all targets, s390x uses u64. + // It looks like a spec bug, but we need to follow suit for the + // sake of binary compatibility. 
+ using Entry = std::conditional_t, U64, U32>; }; template @@ -861,7 +906,7 @@ class MergedSection : public Chunk { void resolve(Context &ctx); void compute_section_size(Context &ctx) override; void copy_buf(Context &ctx) override; - void write_to(Context &ctx, u8 *buf, ElfRel *rel) override; + void write_to(Context &ctx, u8 *buf) override; void print_stats(Context &ctx); std::vector *> members; @@ -1039,9 +1084,22 @@ class NotePropertySection : public Chunk { void copy_buf(Context &ctx) override; private: - static constexpr i64 ENTRY_SIZE = E::is_64 ? 16 : 12; + struct Entry64 { + U32 type; + U32 size; + U32 flags; + u8 padding[4]; + }; - std::map properties; + struct Entry32 { + U32 type; + U32 size; + U32 flags; + }; + + using Entry = std::conditional_t; + + std::vector contents; }; template @@ -1251,12 +1309,12 @@ class MergeableSection { MergedSection &parent; std::vector *> fragments; + u8 p2align = 0; private: std::unique_ptr> section; std::vector frag_offsets; std::vector hashes; - u8 p2align = 0; }; // InputFile is the base class of ObjectFile and SharedFile. 
@@ -1276,8 +1334,7 @@ class InputFile { std::string_view get_string(Context &ctx, const ElfShdr &shdr); std::string_view get_string(Context &ctx, i64 idx); - - ElfEhdr &get_ehdr() { return *(ElfEhdr *)mf->data; } + u32 get_eflags() { return ((ElfEhdr *)mf->data)->e_flags; } std::span> get_phdrs(); ElfShdr *find_section(i64 type); @@ -1300,7 +1357,7 @@ class InputFile { std::string filename; bool is_dso = false; // a shared object i64 priority; - Atomic is_alive = false; + Atomic is_reachable = false; std::string_view shstrtab; std::string_view symbol_strtab; @@ -1343,8 +1400,8 @@ class ObjectFile : public InputFile { public: ObjectFile() = default; - ObjectFile(Context &ctx, MappedFile *mf, - std::string archive_name, bool is_in_lib); + ObjectFile(Context &ctx, MappedFile *mf, std::string archive_name) + : InputFile(ctx, mf), archive_name(archive_name) {} void parse(Context &ctx); void initialize_symbols(Context &ctx); @@ -1366,11 +1423,10 @@ class ObjectFile : public InputFile { std::string archive_name; std::vector>> sections; std::vector>> mergeable_sections; - bool is_in_lib = false; std::vector> elf_sections2; std::vector> cies; std::vector> fdes; - BitVector has_symver; + std::vector has_symver; std::vector> comdat_groups; std::vector *> eh_frame_sections; bool exclude_libs = false; @@ -1532,11 +1588,20 @@ void lto_cleanup(Context &ctx); // shrink-sections.cc // +inline i64 get_removed_bytes(std::span deltas, i64 i) { + if (i == 0) + return deltas[i].delta; + return deltas[i].delta - deltas[i - 1].delta; +} + template void shrink_sections(Context &ctx); template -void shrink_section(Context &ctx, InputSection &isec, bool use_rvc); +void shrink_section(Context &ctx, InputSection &isec); + +template +i64 get_r_delta(InputSection &isec, u64 offset); template i64 compute_distance(Context &ctx, Symbol &sym, @@ -1638,6 +1703,7 @@ template void compute_section_headers(Context &); template i64 set_osec_offsets(Context &); template void fix_synthetic_symbols(Context 
&); template void compress_debug_sections(Context &); +template void sort_reldyn(Context &); template void write_build_id(Context &); template void write_gnu_debuglink(Context &); template void write_separate_debug_file(Context &ctx); @@ -1820,6 +1886,11 @@ struct SectionOrder { template struct ContextExtras {}; +template +struct ContextExtras { + NotePropertySection *note_property = nullptr; +}; + template <> struct ContextExtras { Arm32ExidxSection *exidx = nullptr; @@ -2053,7 +2124,7 @@ struct Context { // Output buffer std::unique_ptr> output_file; u8 *buf = nullptr; - bool overwrite_output_file = true; + bool overwrite_output_file = false; std::vector *> chunks; Atomic needs_tlsld = false; @@ -2097,7 +2168,6 @@ struct Context { VerdefSection *verdef = nullptr; BuildIdSection *buildid = nullptr; NotePackageSection *note_package = nullptr; - NotePropertySection *note_property = nullptr; GdbIndexSection *gdb_index = nullptr; RelroPaddingSection *relro_padding = nullptr; MergedSection *comment = nullptr; @@ -2179,24 +2249,14 @@ enum { NEEDS_PPC_OPD = 1 << 7, // for PPCv1 }; -// A struct to hold target-dependent symbol members. -template -struct SymbolExtras {}; - -template -struct SymbolExtras { - // For range extension thunks - i16 thunk_idx = -1; - i16 thunk_sym_idx = -1; -}; - // Flags for Symbol::get_addr() enum { NO_PLT = 1 << 0, // Request an address other than .plt NO_OPD = 1 << 1, // Request an address other than .opd (PPC64V1 only) }; -// Symbol class represents a defined symbol. +// Symbol class represents a symbol. For each unique symbol name, we +// create one instance of Symbol. // // A symbol has not only one but several different addresses if it // has PLT or GOT entries. 
This class provides various functions to @@ -2249,6 +2309,8 @@ class Symbol { u32 get_djb_hash(Context &ctx) const; void set_djb_hash(Context &ctx, u32 hash); + u64 get_thunk_addr(Context &ctx, u64 P) const requires needs_thunk; + bool is_absolute() const; bool is_relative() const { return !is_absolute(); } bool is_local(Context &ctx) const; @@ -2445,14 +2507,8 @@ class Symbol { // opposed to IR object). bool referenced_by_regular_obj : 1 = false; - // For `-z rewrite-endbr` - bool address_taken : 1 = false; - // If true, we try to dmenagle the sybmol when printing. bool demangle : 1 = false; - - // Target-dependent extra members. - [[no_unique_address]] SymbolExtras extra; }; template @@ -2565,16 +2621,6 @@ InputSection::get_fragment(Context &ctx, const ElfRel &rel) { return {p.first, p.second + get_addend(*this, rel)}; } -template -u64 InputSection::get_thunk_addr(i64 idx) { - if constexpr (needs_thunk) { - ThunkRef ref = extra.thunk_refs[idx]; - assert(ref.thunk_idx != -1); - return output_section->thunks[ref.thunk_idx]->get_addr(ref.sym_idx); - } - unreachable(); -} - // Input object files may contain duplicate code for inline functions // and such. Linkers de-duplicate them at link-time. However, linkers // generaly don't remove debug info for de-duplicated functions because @@ -2965,6 +3011,18 @@ inline void Symbol::set_djb_hash(Context &ctx, u32 hash) { ctx.symbol_aux[aux_idx].djb_hash = hash; } +template +u64 +Symbol::get_thunk_addr(Context &ctx, u64 P) const requires needs_thunk { + std::span vec = ctx.symbol_aux[aux_idx].thunk_addrs; + u64 lo = (P < branch_distance) ? 
0 : P - branch_distance; + u64 val = *std::lower_bound(vec.begin(), vec.end(), lo); + i64 disp = val - P; + if (disp < -branch_distance || branch_distance <= disp) + Fatal(ctx) << "range extension thunk out of range: " << *this; + return val; +} + template inline bool Symbol::has_plt(Context &ctx) const { return get_plt_idx(ctx) != -1 || get_pltgot_idx(ctx) != -1; @@ -3137,7 +3195,7 @@ std::string_view save_string(Context &ctx, const std::string &str) { u8 *buf = new u8[str.size() + 1]; memcpy(buf, str.data(), str.size()); buf[str.size()] = '\0'; - ctx.string_pool.push_back(std::unique_ptr(buf)); + ctx.string_pool.emplace_back(buf); return {(char *)buf, str.size()}; } diff --git a/src/output-chunks.cc b/src/output-chunks.cc index 353272e573..ef1a4c3488 100644 --- a/src/output-chunks.cc +++ b/src/output-chunks.cc @@ -1,5 +1,4 @@ #include "mold.h" -#include "config.h" #include #include @@ -408,47 +407,6 @@ void RelDynSection::update_shdr(Context &ctx) { this->shdr.sh_link = ctx.dynsym->shndx; } -template -void RelDynSection::sort(Context &ctx) { - Timer t(ctx, "sort_dynamic_relocs"); - - ElfRel *begin = (ElfRel *)(ctx.buf + this->shdr.sh_offset); - ElfRel *end = begin + this->shdr.sh_size / sizeof(ElfRel); - - auto get_rank = [](u32 r_type) { - if (r_type == E::R_RELATIVE) - return 0; - if constexpr (supports_ifunc) - if (r_type == E::R_IRELATIVE) - return 2; - return 1; - }; - - // This is the reason why we sort dynamic relocations. Quote from - // https://www.airs.com/blog/archives/186: - // - // The dynamic linker in glibc uses a one element cache when processing - // relocs: if a relocation refers to the same symbol as the previous - // relocation, then the dynamic linker reuses the value rather than - // looking up the symbol again. Thus the dynamic linker gets the best - // results if the dynamic relocations are sorted so that all dynamic - // relocations for a given dynamic symbol are adjacent. 
- // - // Other than that, the linker sorts together all relative relocations, - // which don't have symbols. Two relative relocations, or two relocations - // against the same symbol, are sorted by the address in the output - // file. This tends to optimize paging and caching when there are two - // references from the same page. - // - // We group IFUNC relocations at the end of .rel.dyn because we want to - // apply all the other relocations before running user-supplied ifunc - // resolver functions. - tbb::parallel_sort(begin, end, [&](const ElfRel &a, const ElfRel &b) { - return std::tuple(get_rank(a.r_type), a.r_sym, a.r_offset) < - std::tuple(get_rank(b.r_type), b.r_sym, b.r_offset); - }); -} - template void RelrDynSection::update_shdr(Context &ctx) { i64 n = 0; @@ -535,11 +493,10 @@ void ShstrtabSection::copy_buf(Context &ctx) { template i64 DynstrSection::add_string(std::string_view str) { - if (this->shdr.sh_size == 0) + if (this->shdr.sh_size == 0) { + strings.insert({"", 0}); this->shdr.sh_size = 1; - - if (str.empty()) - return 0; + } auto [it, inserted] = strings.insert({str, this->shdr.sh_size}); if (inserted) @@ -549,9 +506,6 @@ i64 DynstrSection::add_string(std::string_view str) { template i64 DynstrSection::find_string(std::string_view str) { - if (str.empty()) - return 0; - auto it = strings.find(str); assert(it != strings.end()); return it->second; @@ -560,12 +514,11 @@ i64 DynstrSection::find_string(std::string_view str) { template void DynstrSection::copy_buf(Context &ctx) { u8 *base = ctx.buf + this->shdr.sh_offset; - base[0] = '\0'; for (std::pair p : strings) write_string(base + p.second, p.first); - i64 off = dynsym_offset; + i64 off = ctx.dynsym->dynstr_offset; for (Symbol *sym : ctx.dynsym->symbols) if (sym) off += write_string(base + off, sym->name()); @@ -665,14 +618,15 @@ void SymtabSection::copy_buf(Context &ctx) { // is set in the dynamic section. // // This function returns true if DT_AARCH64_VARIANT_PCS needs to be set. 
-static bool contains_variant_pcs(Context &ctx) { - for (Symbol *sym : ctx.plt->symbols) +template +static bool contains_variant_pcs(Context &ctx) { + for (Symbol *sym : ctx.plt->symbols) if (sym->esym().arm64_variant_pcs) return true; return false; } -// RISC-V has the same feature but with different names. +// RISC-V has the same feature but with a different name. template static bool contains_variant_cc(Context &ctx) { for (Symbol *sym : ctx.plt->symbols) @@ -842,10 +796,10 @@ static std::vector> create_dynamic_section(Context &ctx) { define(DT_PPC_GOT, ctx.gotplt->shdr.sh_addr); if constexpr (is_ppc64) { - // PPC64_GLINK is defined by the psABI to refer 32 bytes before + // PPC64_GLINK is defined by the psABI to refer to 32 bytes before // the first PLT entry. I don't know why it's 32 bytes off, but // it's what it is. - define(DT_PPC64_GLINK, ctx.plt->shdr.sh_addr + E::plt_hdr_size - 32); + define(DT_PPC64_GLINK, ctx.plt->shdr.sh_addr + to_plt_offset(0) - 32); } // GDB needs a DT_DEBUG entry in an executable to store a word-size @@ -857,7 +811,6 @@ static std::vector> create_dynamic_section(Context &ctx) { for (i64 i = 0; i < ctx.arg.spare_dynamic_tags; i++) define(DT_NULL, 0); - return vec; } @@ -891,22 +844,15 @@ static std::vector> split(std::vector &input, i64 unit) { return vec; } - // Assign offsets to OutputSection members template void OutputSection::compute_section_size(Context &ctx) { ElfShdr &shdr = this->shdr; - // On most RISC systems, we need to create so-called "range extension - // thunks" to extend branch instructions reach, as their jump - // instructions' reach is limited. create_range_extension_thunks() - // computes the size of the section while inserting thunks. - if constexpr (needs_thunk) { - if ((shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) { - create_range_extension_thunks(ctx); - return; - } - } + // Text sections must to be handled by create_range_extension_thunks() + // if they may need range extension thunks. 
+ assert(!needs_thunk || !(shdr.sh_flags & SHF_EXECINSTR) || + ctx.arg.relocatable); // Since one output section may contain millions of input sections, // we first split input sections into groups and assign offsets to @@ -914,61 +860,101 @@ void OutputSection::compute_section_size(Context &ctx) { struct Group { std::span *> members; i64 size = 0; - i64 p2align = 0; i64 offset = 0; }; - std::span *> mem = members; std::vector groups; constexpr i64 group_size = 10000; - while (!mem.empty()) { - i64 sz = std::min(group_size, mem.size()); - groups.push_back({mem.subspan(0, sz)}); - mem = mem.subspan(sz); + for (std::span *> m = members; !m.empty();) { + i64 sz = std::min(group_size, m.size()); + groups.push_back({m.subspan(0, sz)}); + m = m.subspan(sz); } tbb::parallel_for_each(groups, [](Group &group) { - for (InputSection *isec : group.members) { - group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size; - group.p2align = std::max(group.p2align, isec->p2align); - } + i64 off = 0; + for (InputSection *isec : group.members) + off = align_to(off, 1 << isec->p2align) + isec->sh_size; + group.size = off; }); - shdr.sh_size = 0; - + i64 off = 0; for (i64 i = 0; i < groups.size(); i++) { - shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align); - groups[i].offset = shdr.sh_size; - shdr.sh_size += groups[i].size; - shdr.sh_addralign = std::max(shdr.sh_addralign, 1 << groups[i].p2align); + off = align_to(off, shdr.sh_addralign); + groups[i].offset = off; + off += groups[i].size; } + shdr.sh_size = off; + // Assign offsets to input sections. 
tbb::parallel_for_each(groups, [](Group &group) { - i64 offset = group.offset; + i64 off = group.offset; for (InputSection *isec : group.members) { - offset = align_to(offset, 1 << isec->p2align); - isec->offset = offset; - offset += isec->sh_size; + off = align_to(off, 1 << isec->p2align); + isec->offset = off; + off += isec->sh_size; } }); } template void OutputSection::copy_buf(Context &ctx) { - if (this->shdr.sh_type != SHT_NOBITS) { - ElfRel *rel = nullptr; - if (ctx.reldyn) - rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - this->reldyn_offset); + if (this->shdr.sh_type == SHT_NOBITS) + return; + + // Copy section contents + u8 *buf = ctx.buf + this->shdr.sh_offset; + write_to(ctx, buf); + + // Emit dynamic relocations + if (!ctx.reldyn) + return; + + ElfRel *rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + + this->reldyn_offset); + + for (AbsRel &r : abs_rels) { + Symbol &sym = *r.sym; + u8 *loc = buf + r.isec->offset + r.offset; + u64 S = sym.get_addr(ctx); + u64 A = r.addend; + u64 P = this->shdr.sh_addr + r.isec->offset + r.offset; + + if constexpr (is_riscv || is_loongarch) { + i64 delta = get_r_delta(*r.isec, r.offset); + loc -= delta; + P -= delta; + } + + auto dynrel = [&](i64 ty, i64 idx, u64 val) { + *rel++ = ElfRel(P, ty, idx, val); + if (ctx.arg.apply_dynamic_relocs) + *(Word *)loc = val; + }; - write_to(ctx, ctx.buf + this->shdr.sh_offset, rel); + switch (r.kind) { + case ABS_REL_NONE: + case ABS_REL_RELR: + *(Word *)loc = S + A; + break; + case ABS_REL_BASEREL: + dynrel(E::R_RELATIVE, 0, S + A); + break; + case ABS_REL_IFUNC: + if constexpr (supports_ifunc) + dynrel(E::R_IRELATIVE, 0, sym.get_addr(ctx, NO_PLT) + A); + break; + case ABS_REL_DYNREL: + dynrel(E::R_ABS, sym.get_dynsym_idx(ctx), A); + break; + } } } template -void OutputSection::write_to(Context &ctx, u8 *buf, ElfRel *rel) { +void OutputSection::write_to(Context &ctx, u8 *buf) { // Copy section contents to an output file. 
tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { InputSection &isec = *members[i]; @@ -1001,40 +987,6 @@ void OutputSection::write_to(Context &ctx, u8 *buf, ElfRel *rel) { thunk->copy_buf(ctx); }); } - - // Emit dynamic relocations. - for (AbsRel &r : abs_rels) { - Word *loc = (Word *)(buf + r.isec->offset + r.offset); - u64 addr = this->shdr.sh_addr + r.isec->offset + r.offset; - Symbol &sym = *r.sym; - - switch (r.kind) { - case ABS_REL_NONE: - case ABS_REL_RELR: - *loc = sym.get_addr(ctx) + r.addend; - break; - case ABS_REL_BASEREL: { - u64 val = sym.get_addr(ctx) + r.addend; - *rel++ = ElfRel(addr, E::R_RELATIVE, 0, val); - if (ctx.arg.apply_dynamic_relocs) - *loc = val; - break; - } - case ABS_REL_IFUNC: - if constexpr (supports_ifunc) { - u64 val = sym.get_addr(ctx, NO_PLT) + r.addend; - *rel++ = ElfRel(addr, E::R_IRELATIVE, 0, val); - if (ctx.arg.apply_dynamic_relocs) - *loc = val; - } - break; - case ABS_REL_DYNREL: - *rel++ = ElfRel(addr, E::R_ABS, sym.get_dynsym_idx(ctx), r.addend); - if (ctx.arg.apply_dynamic_relocs) - *loc = r.addend; - break; - } - } } // .relr.dyn contains base relocations encoded in a space-efficient form. @@ -1056,10 +1008,9 @@ void OutputSection::write_to(Context &ctx, u8 *buf, ElfRel *rel) { // the .rel.dyn section). A bitmap has LSB 1. template static std::vector encode_relr(std::span pos) { - for (i64 i = 0; i < pos.size(); i++) { - assert(pos[i] % sizeof(Word) == 0); - assert(i == 0 || pos[i - 1] < pos[i]); - } + assert(std::all_of(pos.begin(), pos.end(), + [](u64 x) { return x % sizeof(Word) == 0; })); + assert(std::is_sorted(pos.begin(), pos.end())); std::vector vec; i64 num_bits = E::is_64 ? 63 : 31; @@ -1149,7 +1100,7 @@ void OutputSection::scan_abs_relocations(Context &ctx) { // If --pack-dyn-relocs=relr is enabled, base relocations are put into // .relr.dyn. 
- if (ctx.arg.pack_dyn_relocs_relr) + if (ctx.arg.pack_dyn_relocs_relr && !(this->shdr.sh_flags & SHF_EXECINSTR)) for (AbsRel &r : abs_rels) if (r.kind == ABS_REL_BASEREL && r.isec->shdr().sh_addralign % sizeof(Word) == 0 && @@ -1192,7 +1143,7 @@ void OutputSection::compute_symtab_size(Context &ctx) { this->num_local_symtab += thunk->symbols.size(); for (Symbol *sym : thunk->symbols) - this->strtab_size += sym->name().size() + sizeof("$thunk"); + this->strtab_size += sym->name().size() + thunk->name.size() + 2; } } } @@ -1226,7 +1177,8 @@ void OutputSection::populate_symtab(Context &ctx) { write_esym(addr, strtab - strtab_base); strtab += write_string(strtab, sym.name()) - 1; - strtab += write_string(strtab, "$thunk"); + *strtab++ = '$'; + strtab += write_string(strtab, thunk->name); // Emit "$t", "$a" and "$d" if ARM32. if constexpr (is_arm32) { @@ -1334,12 +1286,6 @@ static std::vector> get_got_entries(Context &ctx) { for (Symbol *sym : ctx.got->got_syms) { i64 idx = sym->get_got_idx(ctx); - // If a symbol is imported, let the dynamic linker to resolve it. - if (sym->is_imported) { - add({idx, 0, E::R_GLOB_DAT, sym}); - continue; - } - // IFUNC always needs to be fixed up by the dynamic linker. if constexpr (supports_ifunc) { if (sym->is_ifunc()) { @@ -1353,6 +1299,12 @@ static std::vector> get_got_entries(Context &ctx) { } } + // If a symbol is imported, let the dynamic linker to resolve it. + if (sym->is_imported) { + add({idx, 0, E::R_GLOB_DAT, sym}); + continue; + } + // If we know an address at link-time, fill that GOT entry now. // It may need a base relocation, though. if (ctx.arg.pic && sym->is_relative()) @@ -1446,7 +1398,7 @@ void GotSection::copy_buf(Context &ctx) { if (ctx.dynamic) buf[0] = ctx.dynamic->shdr.sh_addr; - // arm64 psABI doesn't say anything about GOT[0], but glibc/arm64's code + // ARM64 psABI doesn't say anything about GOT[0], but glibc/arm64's code // path for -static-pie wrongly assumed that GOT[0] refers to _DYNAMIC. 
// // https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=43d06ed218fc8be5 @@ -1468,24 +1420,23 @@ void GotSection::copy_buf(Context &ctx) { ent.sym ? ent.sym->get_dynsym_idx(ctx) : 0, ent.val); + // A single TLSDESC relocation fixes two consecutive GOT slots + // where one slot holds a function pointer and the other an + // argument to the function. An addend should be applied not to + // the function pointer but to the function argument, which is + // usually stored to the second slot. + // + // ARM32 employs the inverted layout for some reason, so an + // addend is applied to the first slot. bool is_tlsdesc = false; - if constexpr (supports_tlsdesc) + if constexpr (supports_tlsdesc && !is_arm32) is_tlsdesc = (ent.r_type == E::R_TLSDESC); if (ctx.arg.apply_dynamic_relocs) { - if (is_tlsdesc && !is_arm32) { - // A single TLSDESC relocation fixes two consecutive GOT slots - // where one slot holds a function pointer and the other an - // argument to the function. An addend should be applied not to - // the function pointer but to the function argument, which is - // usually stored to the second slot. - // - // ARM32 employs the inverted layout for some reason, so an - // addend is applied to the first slot. + if (is_tlsdesc) buf[ent.idx + 1] = ent.val; - } else { + else buf[ent.idx] = ent.val; - } } } } @@ -1647,6 +1598,7 @@ void PltSection::populate_symtab(Context &ctx) { memset(esym, 0, sizeof(*esym)); esym->st_name = st_name; esym->st_type = STT_FUNC; + esym->st_bind = STB_LOCAL; esym->st_shndx = this->shndx; esym->st_value = addr; esym++; @@ -1757,6 +1709,20 @@ void RelPltSection::copy_buf(Context &ctx) { } } +// RISC-V and LoongArch have code-shrinking linker relaxation. If we +// have removed instructions from a function, we need to update its +// size as well. 
+template +static u64 get_symbol_size(Symbol &sym) { + const ElfSym &esym = sym.esym(); + if constexpr (is_riscv || is_loongarch) + if (esym.st_size > 0) + if (InputSection *isec = sym.get_input_section()) + return esym.st_size + esym.st_value - sym.value - + get_r_delta(*isec, esym.st_value + esym.st_size); + return esym.st_size; +} + template ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, U32 *shn_xindex) { @@ -1765,7 +1731,7 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, esym.st_name = st_name; esym.st_type = sym.get_type(); - esym.st_size = sym.esym().st_size; + esym.st_size = get_symbol_size(sym); if (sym.is_local(ctx)) esym.st_bind = STB_LOCAL; @@ -1895,18 +1861,15 @@ void DynsymSection::update_shdr(Context &ctx) { template void DynsymSection::copy_buf(Context &ctx) { - u8 *base = ctx.buf + this->shdr.sh_offset; - memset(base, 0, sizeof(ElfSym)); - i64 name_offset = ctx.dynstr->dynsym_offset; + ElfSym *buf = (ElfSym *)(ctx.buf + this->shdr.sh_offset); + i64 offset = dynstr_offset; + + memset(buf, 0, sizeof(ElfSym)); for (i64 i = 1; i < symbols.size(); i++) { Symbol &sym = *symbols[i]; - ElfSym &esym = - *(ElfSym *)(base + sym.get_dynsym_idx(ctx) * sizeof(ElfSym)); - - esym = to_output_esym(ctx, sym, name_offset, nullptr); - name_offset += sym.name().size() + 1; - assert(esym.st_bind != STB_LOCAL || i < this->shdr.sh_info); + buf[sym.get_dynsym_idx(ctx)] = to_output_esym(ctx, sym, offset, nullptr); + offset += sym.name().size() + 1; } } @@ -1915,9 +1878,9 @@ void HashSection::update_shdr(Context &ctx) { if (ctx.dynsym->symbols.empty()) return; - i64 header_size = 8; + i64 header_size = sizeof(Entry) * 2; i64 num_slots = ctx.dynsym->symbols.size(); - this->shdr.sh_size = header_size + num_slots * 8; + this->shdr.sh_size = header_size + num_slots * sizeof(Entry) * 2; this->shdr.sh_link = ctx.dynsym->shndx; } @@ -1927,21 +1890,18 @@ void HashSection::copy_buf(Context &ctx) { memset(base, 0, this->shdr.sh_size); std::span *> syms = 
ctx.dynsym->symbols; - U32 *hdr = (U32 *)base; - U32 *buckets = (U32 *)(base + 8); - U32 *chains = buckets + syms.size(); + Entry *hdr = (Entry *)base; + Entry *buckets = hdr + 2; + Entry *chains = buckets + syms.size(); - hdr[0] = hdr[1] = syms.size(); + hdr[0] = syms.size(); + hdr[1] = syms.size(); - std::vector hashes(syms.size()); - tbb::parallel_for((i64)1, (i64)syms.size(), [&](i64 i) { - hashes[i] = elf_hash(syms[i]->name()) % syms.size(); - }); - - for (i64 i = 1; i < syms.size(); i++) { - i64 h = hashes[i]; - chains[syms[i]->get_dynsym_idx(ctx)] = buckets[h]; - buckets[h] = syms[i]->get_dynsym_idx(ctx); + for (Symbol *sym : syms.subspan(1)) { + i64 i = sym->get_dynsym_idx(ctx); + i64 h = elf_hash(sym->name()) % syms.size(); + chains[i] = buckets[h]; + buckets[h] = i; } } @@ -1968,18 +1928,20 @@ void GnuHashSection::copy_buf(Context &ctx) { i64 first_exported = ctx.dynsym->symbols.size() - num_exported; - std::span *> syms = ctx.dynsym->symbols; - syms = syms.subspan(first_exported); - - std::vector indices(num_exported); - *(U32 *)base = num_buckets; *(U32 *)(base + 4) = first_exported; *(U32 *)(base + 8) = num_bloom; *(U32 *)(base + 12) = BLOOM_SHIFT; + std::span *> syms = ctx.dynsym->symbols; + syms = syms.subspan(first_exported); + + if (syms.empty()) + return; + // Write a bloom filter Word *bloom = (Word *)(base + HEADER_SIZE); + std::vector indices(num_exported); for (i64 i = 0; i < syms.size(); i++) { constexpr i64 word_bits = sizeof(Word) * 8; @@ -1995,9 +1957,8 @@ void GnuHashSection::copy_buf(Context &ctx) { // Write hash bucket indices U32 *buckets = (U32 *)(bloom + num_bloom); - for (i64 i = 0; i < syms.size(); i++) - if (!buckets[indices[i]]) - buckets[indices[i]] = i + first_exported; + for (i64 i = syms.size() - 1; i >= 0; i--) + buckets[indices[i]] = first_exported + i; // Write a hash table U32 *table = buckets + num_buckets; @@ -2022,14 +1983,22 @@ get_merged_output_name(Context &ctx, std::string_view name, u64 flags, if (ctx.arg.unique 
&& ctx.arg.unique->match(name)) return name; - // GCC seems to create sections named ".rodata.strN..M". - // We want to eliminate the symbol name part from the section name. - if ((flags & SHF_STRINGS) && name.starts_with(".rodata.")) { - std::string name2 = ".rodata.str"s + std::to_string(entsize) + - "." + std::to_string(addralign); - if (name == name2) - return name; - return save_string(ctx, name2); + // GCC seems to create sections named ".rodata.strN..M" + // or ".rodata.cst.::resolve(Context &ctx) { if (this == ctx.comment) add_comment_strings(ctx); + + // Compute section alignment + u32 p2align = 0; + for (MergeableSection *sec : members) + p2align = std::max(p2align, sec->p2align); + this->shdr.sh_addralign = 1 << p2align; + resolved = true; } @@ -2155,14 +2131,11 @@ void MergedSection::compute_section_size(Context &ctx) { resolve(ctx); std::vector sizes(map.NUM_SHARDS); - Atomic alignment = 1; tbb::parallel_for((i64)0, map.NUM_SHARDS, [&](i64 i) { using Entry = typename decltype(map)::Entry; std::vector entries = map.get_sorted_entries(i); - i64 offset = 0; - i64 p2align = 0; for (Entry *ent : entries) { SectionFragment &frag = ent->value; @@ -2170,15 +2143,9 @@ void MergedSection::compute_section_size(Context &ctx) { offset = align_to(offset, 1 << frag.p2align); frag.offset = offset; offset += ent->keylen; - p2align = std::max(p2align, frag.p2align); } } - sizes[i] = offset; - update_maximum(alignment, 1 << p2align); - - static Counter merged_strings("merged_strings"); - merged_strings += entries.size(); }); i64 shard_size = map.nbuckets / map.NUM_SHARDS; @@ -2186,7 +2153,7 @@ void MergedSection::compute_section_size(Context &ctx) { for (i64 i = 1; i < map.NUM_SHARDS + 1; i++) shard_offsets[i] = - align_to(shard_offsets[i - 1] + sizes[i - 1], alignment); + align_to(shard_offsets[i - 1] + sizes[i - 1], this->shdr.sh_addralign); tbb::parallel_for((i64)1, map.NUM_SHARDS, [&](i64 i) { for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) { @@ -2197,7 
+2164,6 @@ void MergedSection::compute_section_size(Context &ctx) { }); this->shdr.sh_size = shard_offsets[map.NUM_SHARDS]; - this->shdr.sh_addralign = alignment; if (this->shdr.sh_size > UINT32_MAX) Fatal(ctx) << this->name << ": output section too large"; @@ -2205,17 +2171,18 @@ void MergedSection::compute_section_size(Context &ctx) { template void MergedSection::copy_buf(Context &ctx) { - write_to(ctx, ctx.buf + this->shdr.sh_offset, nullptr); + write_to(ctx, ctx.buf + this->shdr.sh_offset); } template -void MergedSection::write_to(Context &ctx, u8 *buf, ElfRel *rel) { +void MergedSection::write_to(Context &ctx, u8 *buf) { i64 shard_size = map.nbuckets / map.NUM_SHARDS; tbb::parallel_for((i64)0, map.NUM_SHARDS, [&](i64 i) { // There might be gaps between strings to satisfy alignment requirements. // If that's the case, we need to zero-clear them. - if (this->shdr.sh_addralign > 1) + if (this->shdr.sh_addralign > 1 && + this->shdr.sh_addralign != this->shdr.sh_entsize) memset(buf + shard_offsets[i], 0, shard_offsets[i + 1] - shard_offsets[i]); // Copy strings @@ -2244,7 +2211,7 @@ void EhFrameSection::construct(Context &ctx) { // Remove dead FDEs and assign them offsets within their corresponding // CIE group. - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { std::erase_if(file->fdes, [](FdeRecord &fde) { return !fde.is_alive; }); i64 offset = 0; @@ -2305,7 +2272,7 @@ void EhFrameSection::copy_buf(Context &ctx) { HdrEntry *eh_hdr = nullptr; if (ctx.eh_frame_hdr) eh_hdr = (HdrEntry *)(ctx.buf + ctx.eh_frame_hdr->shdr.sh_offset + - EhFrameHdrSection::HEADER_SIZE); + EhFrameHdrSection::HEADER_SIZE); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { // Copy CIEs. 
@@ -2344,24 +2311,22 @@ void EhFrameSection::copy_buf(Context &ctx) { if (ctx.arg.relocatable) continue; - for (const ElfRel &rel : rels) { + for (i64 j = 0; j < rels.size(); j++) { + const ElfRel &rel = rels[j]; assert(rel.r_offset - fde.input_offset < contents.size()); Symbol &sym = *file->symbols[rel.r_sym]; u64 loc = offset + rel.r_offset - fde.input_offset; u64 val = sym.get_addr(ctx) + get_addend(cie.input_section, rel); apply_eh_reloc(ctx, rel, loc, val); - } - if (eh_hdr) { - // Write to .eh_frame_hdr - Symbol &sym = *file->symbols[rels[0].r_sym]; - u64 val = sym.get_addr(ctx) + get_addend(cie.input_section, rels[0]); - u64 sh_addr = ctx.eh_frame_hdr->shdr.sh_addr; - - HdrEntry &ent = eh_hdr[file->fde_idx + i]; - ent.init_addr = val - sh_addr; - ent.fde_addr = this->shdr.sh_addr + offset - sh_addr; + if (j == 0 && eh_hdr) { + // Write to .eh_frame_hdr + HdrEntry &ent = eh_hdr[file->fde_idx + i]; + u64 origin = ctx.eh_frame_hdr->shdr.sh_addr; + ent.init_addr = val - origin; + ent.fde_addr = this->shdr.sh_addr + offset - origin; + } } } }); @@ -2390,7 +2355,7 @@ template void EhFrameHdrSection::copy_buf(Context &ctx) { u8 *base = ctx.buf + this->shdr.sh_offset; - // Write a header. The actual table is written by EhFrameHdr::copy_buf. + // Write a header. The actual table is written by EhFrameSection::copy_buf. base[0] = 1; base[1] = DW_EH_PE_pcrel | DW_EH_PE_sdata4; base[2] = DW_EH_PE_udata4; @@ -2549,13 +2514,12 @@ void VersymSection::copy_buf(Context &ctx) { // `GLIBC_ABI_DT_RELR' not found" error message. glibc 2.38 or later knows // about this dummy version name and simply ignores it. 
template -static InputFile *find_glibc2(Context &ctx) { - for (Symbol *sym : ctx.dynsym->symbols) - if (sym && sym->file->is_dso && - ((SharedFile *)sym->file)->soname.starts_with("libc.so.") && - sym->get_version().starts_with("GLIBC_2.")) - return sym->file; - return nullptr; +static bool is_glibc2(SharedFile &file) { + if (file.soname.starts_with("libc.so.")) + for (std::string_view str : file.version_strings) + if (str.starts_with("GLIBC_2.")) + return true; + return false; } template @@ -2595,19 +2559,6 @@ void VerneedSection::construct(Context &ctx) { i64 veridx = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size(); - auto start_group = [&](InputFile *file) { - this->shdr.sh_info++; - if (verneed) - verneed->vn_next = ptr - (u8 *)verneed; - - verneed = (ElfVerneed *)ptr; - ptr += sizeof(ElfVerneed); - verneed->vn_version = 1; - verneed->vn_file = ctx.dynstr->find_string(((SharedFile *)file)->soname); - verneed->vn_aux = sizeof(ElfVerneed); - aux = nullptr; - }; - auto add_entry = [&](std::string_view verstr) { verneed->vn_cnt++; @@ -2621,25 +2572,33 @@ void VerneedSection::construct(Context &ctx) { aux->vna_name = ctx.dynstr->add_string(verstr); }; + auto start_group = [&](SharedFile &file) { + this->shdr.sh_info++; + if (verneed) + verneed->vn_next = ptr - (u8 *)verneed; + + verneed = (ElfVerneed *)ptr; + ptr += sizeof(ElfVerneed); + verneed->vn_version = 1; + verneed->vn_file = ctx.dynstr->find_string(file.soname); + verneed->vn_aux = sizeof(ElfVerneed); + aux = nullptr; + + if (ctx.arg.pack_dyn_relocs_relr && is_glibc2(file)) + add_entry("GLIBC_ABI_DT_RELR"); + }; + // Create version entries. 
for (i64 i = 0; i < syms.size(); i++) { if (i == 0 || syms[i - 1]->file != syms[i]->file) { - start_group(syms[i]->file); + start_group(*(SharedFile *)syms[i]->file); add_entry(syms[i]->get_version()); } else if (syms[i - 1]->ver_idx != syms[i]->ver_idx) { add_entry(syms[i]->get_version()); } - ctx.versym->contents[syms[i]->get_dynsym_idx(ctx)] = veridx; } - if (ctx.arg.pack_dyn_relocs_relr) { - if (InputFile *file = find_glibc2(ctx)) { - start_group(file); - add_entry("GLIBC_ABI_DT_RELR"); - } - } - // Resize .gnu.version_r to fit to its contents. contents.resize(ptr - buf); } @@ -2698,10 +2657,10 @@ void VerdefSection::construct(Context &ctx) { aux->vda_name = ctx.dynstr->add_string(verstr); }; - if (!ctx.arg.soname.empty()) - write(ctx.arg.soname, 1, VER_FLG_BASE); - else - write(ctx.arg.output, 1, VER_FLG_BASE); + std::string_view soname = ctx.arg.soname; + if (soname.empty()) + soname = save_string(ctx, path_filename(ctx.arg.output)); + write(soname, 1, VER_FLG_BASE); i64 idx = VER_NDX_LAST_RESERVED + 1; for (std::string_view verstr : ctx.arg.version_definitions) @@ -2759,13 +2718,6 @@ void NotePackageSection::copy_buf(Context &ctx) { // Merges input files' .note.gnu.property values. template void NotePropertySection::update_shdr(Context &ctx) { - // The rules we support are only specified for x86 psABI - if (!is_x86) - return; - - // Reset to the initial state so that this function is idempotent - properties.clear(); - // Obtain the list of keys std::vector *> files = ctx.objs; std::erase(files, ctx.internal_obj); @@ -2783,6 +2735,8 @@ void NotePropertySection::update_shdr(Context &ctx) { }; // Merge values for each key + std::map map; + for (u32 key : keys) { auto has_key = [&](ObjectFile *file) { return file->gnu_properties.contains(key); @@ -2792,41 +2746,41 @@ void NotePropertySection::update_shdr(Context &ctx) { key <= GNU_PROPERTY_X86_UINT32_AND_HI) { // An AND feature is set if all input objects have the property and // the feature. 
- if (std::all_of(files.begin(), files.end(), has_key)) { - properties[key] = 0xffff'ffff; - for (ObjectFile *file : files) - properties[key] &= get_value(file, key); - } + map[key] = 0xffff'ffff; + for (ObjectFile *file : files) + map[key] &= get_value(file, key); } else if (GNU_PROPERTY_X86_UINT32_OR_LO <= key && key <= GNU_PROPERTY_X86_UINT32_OR_HI) { // An OR feature is set if some input object has the feature. for (ObjectFile *file : files) - properties[key] |= get_value(file, key); + map[key] |= get_value(file, key); } else if (GNU_PROPERTY_X86_UINT32_OR_AND_LO <= key && key <= GNU_PROPERTY_X86_UINT32_OR_AND_HI) { // An OR-AND feature is set if all input object files have the property // and some of them has the feature. if (std::all_of(files.begin(), files.end(), has_key)) for (ObjectFile *file : files) - properties[key] |= get_value(file, key); + map[key] |= get_value(file, key); } } if (ctx.arg.z_ibt) - properties[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_IBT; + map[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_IBT; if (ctx.arg.z_shstk) - properties[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK; + map[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK; + map[GNU_PROPERTY_X86_ISA_1_NEEDED] |= ctx.arg.z_x86_64_isa_level; - properties[GNU_PROPERTY_X86_ISA_1_NEEDED] |= ctx.arg.z_x86_64_isa_level; + // Serialize the map + contents.clear(); - std::erase_if(properties, [](std::pair kv) { - return kv.second == 0; - }); + for (std::pair kv : map) + if (kv.second) + contents.push_back({kv.first, 4, kv.second}); - if (properties.empty()) + if (contents.empty()) this->shdr.sh_size = 0; else - this->shdr.sh_size = 16 + ENTRY_SIZE * properties.size(); + this->shdr.sh_size = 16 + contents.size() * sizeof(contents[0]); } template @@ -2834,18 +2788,11 @@ void NotePropertySection::copy_buf(Context &ctx) { U32 *buf = (U32 *)(ctx.buf + this->shdr.sh_offset); memset(buf, 0, this->shdr.sh_size); - buf[0] 
= 4; // Name size - buf[1] = ENTRY_SIZE * properties.size(); // Content size - buf[2] = NT_GNU_PROPERTY_TYPE_0; // Type - memcpy(buf + 3, "GNU", 4); // Name - - i64 idx = 4; - for (std::pair kv : properties) { - buf[idx] = kv.first; // Feature type - buf[idx + 1] = 4; // Feature size - buf[idx + 2] = kv.second; // Feature flags - idx += ENTRY_SIZE / sizeof(U32); - } + buf[0] = 4; // Name size + buf[1] = this->shdr.sh_size - 16; // Content size + buf[2] = NT_GNU_PROPERTY_TYPE_0; // Type + memcpy(buf + 3, "GNU", 4); // Name + write_vector(buf + 4, contents); // Content } template @@ -2857,7 +2804,7 @@ CompressedSection::CompressedSection(Context &ctx, Chunk &chunk) { this->uncompressed_data.resize(chunk.shdr.sh_size); u8 *buf = this->uncompressed_data.data(); - chunk.write_to(ctx, buf, nullptr); + chunk.write_to(ctx, buf); switch (ctx.arg.compress_debug_sections) { case COMPRESS_ZLIB: @@ -3013,7 +2960,7 @@ void ComdatGroupSection::copy_buf(Context &ctx) { template void GnuDebuglinkSection::update_shdr(Context &ctx) { - filename = std::filesystem::path(ctx.arg.separate_debug_file).filename().string(); + filename = path_filename(ctx.arg.separate_debug_file); this->shdr.sh_size = align_to(filename.size() + 1, 4) + 4; } diff --git a/src/output-file-unix.cc b/src/output-file-unix.cc index 35823d4a30..1a338f07b0 100644 --- a/src/output-file-unix.cc +++ b/src/output-file-unix.cc @@ -4,8 +4,7 @@ #include #include #include -#include -#include +#include namespace mold { @@ -111,14 +110,9 @@ OutputFile::open(Context &ctx, std::string path, i64 filesize, int perm) { if (path.starts_with('/') && !ctx.arg.chroot.empty()) path = ctx.arg.chroot + "/" + path_clean(path); - bool is_special = false; - if (path == "-") { - is_special = true; - } else { - struct stat st; - if (stat(path.c_str(), &st) == 0 && (st.st_mode & S_IFMT) != S_IFREG) - is_special = true; - } + std::error_code error; + bool is_special = path == "-" || + (!std::filesystem::is_regular_file(path, error) && !error); 
OutputFile *file; if (is_special) diff --git a/src/output-file-win32.cc b/src/output-file-win32.cc index 68bd26c8a5..509a98d891 100644 --- a/src/output-file-win32.cc +++ b/src/output-file-win32.cc @@ -1,7 +1,6 @@ #include "mold.h" #include -#include #include namespace mold { diff --git a/src/passes.cc b/src/passes.cc index 10e932d452..10c533cf34 100644 --- a/src/passes.cc +++ b/src/passes.cc @@ -3,13 +3,12 @@ #include #include -#include #include #include #include +#include #include #include -#include #include namespace mold { @@ -23,8 +22,10 @@ int redo_main(Context &ctx, int argc, char **argv) { if (target == I386::name) return mold_main(argc, argv); - if (target == ARM64::name) - return mold_main(argc, argv); + if (target == ARM64LE::name) + return mold_main(argc, argv); + if (target == ARM64BE::name) + return mold_main(argc, argv); if (target == ARM32::name) return mold_main(argc, argv); if (target == RV64LE::name) @@ -47,13 +48,15 @@ int redo_main(Context &ctx, int argc, char **argv) { return mold_main(argc, argv); if (target == M68K::name) return mold_main(argc, argv); - if (target == SH4::name) - return mold_main(argc, argv); + if (target == SH4LE::name) + return mold_main(argc, argv); + if (target == SH4BE::name) + return mold_main(argc, argv); if (target == LOONGARCH32::name) return mold_main(argc, argv); if (target == LOONGARCH64::name) return mold_main(argc, argv); - unreachable(); + abort(); } // --exclude-libs=libraries ...: Mark all symbols in the given libraries hidden. 
@@ -72,17 +75,11 @@ void apply_exclude_libs(Context &ctx) { std::unordered_set set(ctx.arg.exclude_libs.begin(), ctx.arg.exclude_libs.end()); - if (set.contains("ALL")) { + if (!set.empty()) for (ObjectFile *file : ctx.objs) if (!file->archive_name.empty()) - file->exclude_libs = true; - return; - } - - for (ObjectFile *file : ctx.objs) - if (!file->archive_name.empty()) - if (set.contains(path_filename(file->archive_name))) - file->exclude_libs = true; + if (set.contains(path_filename(file->archive_name)) || set.contains("ALL")) + file->exclude_libs = true; } template @@ -174,14 +171,13 @@ void create_synthetic_sections(Context &ctx) { // If .dynamic exists, .dynsym and .dynstr must exist as well // since .dynamic refers to them. - ctx.dynstr->shdr.sh_size = 1; + ctx.dynstr->add_string(""); ctx.dynsym->symbols.resize(1); } ctx.versym = push(new VersymSection); ctx.verneed = push(new VerneedSection); ctx.note_package = push(new NotePackageSection); - ctx.note_property = push(new NotePropertySection); if (!ctx.arg.oformat_binary) { ElfShdr shdr = {}; @@ -190,6 +186,9 @@ void create_synthetic_sections(Context &ctx) { ctx.comment = MergedSection::get_instance(ctx, ".comment", shdr); } + if constexpr (is_x86) + ctx.extra.note_property = push(new NotePropertySection); + if constexpr (is_riscv) ctx.extra.riscv_attributes = push(new RiscvAttributesSection); @@ -204,18 +203,18 @@ template static void mark_live_objects(Context &ctx) { for (Symbol *sym : ctx.arg.undefined) if (sym->file) - sym->file->is_alive = true; + sym->file->is_reachable = true; for (Symbol *sym : ctx.arg.require_defined) if (sym->file) - sym->file->is_alive = true; + sym->file->is_reachable = true; if (!ctx.arg.undefined_glob.empty()) { tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - if (!file->is_alive) { + if (!file->is_reachable) { for (Symbol *sym : file->get_global_syms()) { if (sym->file == file && ctx.arg.undefined_glob.find(sym->name())) { - file->is_alive = true; + 
file->is_reachable = true; sym->gc_root = true; break; } @@ -227,11 +226,11 @@ static void mark_live_objects(Context &ctx) { std::vector *> roots; for (InputFile *file : ctx.objs) - if (file->is_alive) + if (file->is_reachable) roots.push_back(file); for (InputFile *file : ctx.dsos) - if (file->is_alive) + if (file->is_reachable) roots.push_back(file); tbb::parallel_for_each(roots, [&](InputFile *file, @@ -298,13 +297,13 @@ void resolve_symbols(Context &ctx) { // we could eliminate a symbol that is already resolved to and cause // dangling references. tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { - if (file->is_alive) + if (file->is_reachable) for (ComdatGroupRef &ref : file->comdat_groups) update_minimum(ref.group->owner, file->priority); }); tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { - if (file->is_alive) + if (file->is_reachable) for (ComdatGroupRef &ref : file->comdat_groups) if (ref.group->owner != file->priority) for (u32 i : ref.members) @@ -314,7 +313,7 @@ void resolve_symbols(Context &ctx) { // Redo symbol resolution tbb::parallel_for_each(files, [&](InputFile *file) { - if (file->is_alive) + if (file->is_reachable) file->resolve_symbols(ctx); }); @@ -325,7 +324,7 @@ void resolve_symbols(Context &ctx) { std::atomic_bool flag = false; tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { - if (file->is_alive) { + if (file->is_reachable) { for (Symbol *sym : file->symbols) { if (sym->file == file && sym->visibility == STV_HIDDEN) { sym->skip_dso = true; @@ -367,7 +366,7 @@ void do_lto(Context &ctx) { // Remove IR object files. for (ObjectFile *file : ctx.objs) if (file->is_lto_obj) - file->is_alive = false; + file->is_reachable = false; std::erase_if(ctx.objs, [](ObjectFile *file) { return file->is_lto_obj; }); @@ -559,6 +558,9 @@ static bool is_relro(OutputSection &osec) { } // Create output sections for input sections. +// +// Since one output section could contain millions of input sections, +// we need to do it efficiently. 
template void create_output_sections(Context &ctx) { Timer t(ctx, "create_output_sections"); @@ -567,18 +569,14 @@ void create_output_sections(Context &ctx) { OutputSectionKey::Hash>; MapType map; std::shared_mutex mu; - i64 size = ctx.osec_pool.size(); bool ctors_in_init_array = has_ctors_and_init_array(ctx); + tbb::enumerable_thread_specific caches; // Instantiate output sections tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { // Make a per-thread cache of the main map to avoid lock contention. // It makes a noticeable difference if we have millions of input sections. - MapType cache; - { - std::shared_lock lock(mu); - cache = map; - } + MapType &cache = caches.local(); for (std::unique_ptr> &isec : file->sections) { if (!isec || !isec->is_alive) @@ -631,22 +629,40 @@ void create_output_sections(Context &ctx) { } }); + // Add input sections to output sections + for (std::unique_ptr> &osec : ctx.osec_pool) + osec->members_vec.resize(ctx.objs.size()); + + tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { + for (std::unique_ptr> &isec : ctx.objs[i]->sections) + if (isec && isec->output_section) + isec->output_section->members_vec[i].push_back(isec.get()); + }); + + // Compute section alignment + for (std::unique_ptr> &osec : ctx.osec_pool) { + Atomic p2align; + tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { + u32 x = 0; + for (InputSection *isec : osec->members_vec[i]) + x = std::max(x, isec->p2align); + update_maximum(p2align, x); + }); + osec->shdr.sh_addralign = 1 << p2align; + } + for (std::unique_ptr> &osec : ctx.osec_pool) { osec->shdr.sh_flags = osec->sh_flags; osec->is_relro = is_relro(*osec); + osec->members = flatten(osec->members_vec); + osec->members_vec.clear(); + osec->members_vec.shrink_to_fit(); } - // Add input sections to output sections - std::vector *> chunks; - for (i64 i = size; i < ctx.osec_pool.size(); i++) - chunks.push_back(ctx.osec_pool[i].get()); - - for (ObjectFile *file : ctx.objs) - for 
(std::unique_ptr> &isec : file->sections) - if (isec && isec->is_alive) - isec->output_section->members.push_back(isec.get()); - // Add output sections and mergeable sections to ctx.chunks + std::vector *> chunks; + for (std::unique_ptr> &osec : ctx.osec_pool) + chunks.push_back(osec.get()); for (std::unique_ptr> &osec : ctx.merged_sections) chunks.push_back(osec.get()); @@ -675,7 +691,7 @@ void create_internal_file(Context &ctx) { obj->symbols.push_back(new Symbol); obj->first_global = 1; - obj->is_alive = true; + obj->is_reachable = true; obj->priority = 1; auto add = [&](Symbol *sym) { @@ -706,7 +722,6 @@ void create_internal_file(Context &ctx) { add(get_symbol(ctx, ord.name)); obj->elf_syms = ctx.internal_esyms; - obj->has_symver.resize(ctx.internal_esyms.size() - 1); } template @@ -832,8 +847,6 @@ void add_synthetic_symbols(Context &ctx) { add(label); obj.elf_syms = ctx.internal_esyms; - obj.has_symver.resize(ctx.internal_esyms.size() - 1); - obj.resolve_symbols(ctx); // Make all synthetic symbols relative ones by associating them to @@ -872,11 +885,12 @@ void add_synthetic_symbols(Context &ctx) { template void apply_section_align(Context &ctx) { - for (Chunk *chunk : ctx.chunks) - if (OutputSection *osec = chunk->to_osec()) - if (auto it = ctx.arg.section_align.find(osec->name); - it != ctx.arg.section_align.end()) - osec->shdr.sh_addralign = it->second; + std::unordered_map &map = ctx.arg.section_align; + if (!map.empty()) + for (Chunk *chunk : ctx.chunks) + if (OutputSection *osec = chunk->to_osec()) + if (auto it = map.find(osec->name); it != map.end()) + osec->shdr.sh_addralign = it->second; } template @@ -1081,7 +1095,7 @@ void check_shlib_undefined(Context &ctx) { // Skip the file if it depends on a file that we know nothing about. // This is because missing symbols may be provided by that unknown file. 
for (std::string_view needed : file->get_dt_needed(ctx)) - if (sonames.count(needed) == 0) + if (!sonames.contains(needed)) return; // Check if all undefined symbols have been resolved. @@ -1094,6 +1108,22 @@ void check_shlib_undefined(Context &ctx) { << sym; } }); + + // Beyond this point, DSOs that are not referenced directly by any + // object file are not needed. They were kept by + // SharedFile::mark_live_objects just for this pass. Therefore, + // remove unneeded DSOs from the list now. + for (SharedFile *file : ctx.dsos) + file->is_reachable = false; + + tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + for (Symbol *sym : file->get_global_syms()) + if (InputFile *file = sym->file) + if (file->is_dso) + file->is_reachable.test_and_set(); + }); + + std::erase_if(ctx.dsos, [](SharedFile *file) { return !file->is_reachable; }); } template @@ -1191,7 +1221,7 @@ void sort_init_fini(Context &ctx) { vec.push_back({isec, get_init_fini_priority(isec)}); } - sort(vec, [&](const Entry &a, const Entry &b) { return a.prio < b.prio; }); + sort(vec, [](const Entry &a, const Entry &b) { return a.prio < b.prio; }); for (i64 i = 0; i < vec.size(); i++) osec->members[i] = vec[i].sect; @@ -1219,7 +1249,7 @@ void sort_ctor_dtor(Context &ctx) { for (InputSection *isec : osec->members) vec.push_back({isec, get_ctor_dtor_priority(isec)}); - sort(vec, [&](const Entry &a, const Entry &b) { return a.prio < b.prio; }); + sort(vec, [](const Entry &a, const Entry &b) { return a.prio < b.prio; }); for (i64 i = 0; i < vec.size(); i++) osec->members[i] = vec[i].sect; @@ -1242,7 +1272,7 @@ template void fixup_ctors_in_init_array(Context &ctx) { Timer t(ctx, "fixup_ctors_in_init_array"); - auto reverse = [&](InputSection &isec) { + auto reverse_contents = [&](InputSection &isec) { if (isec.sh_size % sizeof(Word)) Fatal(ctx) << isec << ": section corrupted"; @@ -1252,20 +1282,23 @@ void fixup_ctors_in_init_array(Context &ctx) { std::span> rels = isec.get_rels(ctx); for (ElfRel &r : 
rels) r.r_offset = isec.sh_size - r.r_offset - sizeof(Word); - std::reverse(rels.begin(), rels.end()); + + sort(rels, [](const ElfRel &a, const ElfRel &b) { + return a.r_offset < b.r_offset; + }); }; if (Chunk *chunk = find_chunk(ctx, ".init_array")) if (OutputSection *osec = chunk->to_osec()) for (InputSection *isec : osec->members) if (isec->name().starts_with(".ctors")) - reverse(*isec); + reverse_contents(*isec); if (Chunk *chunk = find_chunk(ctx, ".fini_array")) if (OutputSection *osec = chunk->to_osec()) for (InputSection *isec : osec->members) if (isec->name().starts_with(".dtors")) - reverse(*isec); + reverse_contents(*isec); } template @@ -1335,16 +1368,19 @@ void compute_section_sizes(Context &ctx) { Timer t(ctx, "compute_section_sizes"); if constexpr (needs_thunk) { - // We cannot use parallel-for for compute_section_size() which may - // call create_range_extension_thunks() because that function is - // not thread-safe. - for (Chunk *chunk : ctx.chunks) - if (chunk->shdr.sh_flags & SHF_EXECINSTR) - chunk->compute_section_size(ctx); + std::vector *> vec = ctx.chunks; - tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { - if (!(chunk->shdr.sh_flags & SHF_EXECINSTR)) - chunk->compute_section_size(ctx); + auto mid = std::partition(vec.begin(), vec.end(), [&](Chunk *chunk) { + return chunk->to_osec() && (chunk->shdr.sh_flags & SHF_EXECINSTR) && + !ctx.arg.relocatable; + }); + + // create_range_extension_thunks is not thread-safe + for (Chunk *chunk : std::span(vec.begin(), mid)) + chunk->to_osec()->create_range_extension_thunks(ctx); + + tbb::parallel_for_each(mid, vec.end(), [&](Chunk *chunk) { + chunk->compute_section_size(ctx); }); } else { tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { @@ -1364,7 +1400,7 @@ void claim_unresolved_symbols(Context &ctx) { Timer t(ctx, "claim_unresolved_symbols"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - if (!file->is_alive) + if (file == ctx.internal_obj) return; for (i64 i = 
file->first_global; i < file->elf_syms.size(); i++) { @@ -1381,7 +1417,7 @@ void claim_unresolved_symbols(Context &ctx) { // If a symbol name is in the form of "foo@version", search for // symbol "foo" and check if the symbol has version "version". - if (file->has_symver.get(i - file->first_global)) { + if (file->has_symver[i - file->first_global]) { std::string_view str = file->symbol_strtab.data() + esym.st_name; i64 pos = str.find('@'); assert(pos != str.npos); @@ -1414,10 +1450,19 @@ void claim_unresolved_symbols(Context &ctx) { }; if (esym.is_undef_weak()) { - if (ctx.arg.shared && sym.visibility != STV_HIDDEN && - ctx.arg.z_dynamic_undefined_weak) { + if (ctx.arg.z_dynamic_undefined_weak && sym.visibility != STV_HIDDEN) { // Global weak undefined symbols are promoted to dynamic symbols - // when linking a DSO unless `-z nodynamic_undefined_weak` was given. + // by default only when linking a DSO. We generally cannot do that + // for executables because we may need to create a copy relocation + // for a data symbol, but the symbol size is not available for an + // unclaimed weak symbol. + // + // In contrast, GNU ld promotes weak symbols to dynamic ones even + // for an executable as long as they don't need copy relocations + // (i.e. they need only PLT entries.) That may result in an + // inconsistent behavior of a linked program depending on whether + // its object files were compiled with -fPIC or not. I think + // that's bad semantics, so we don't do that. claim(true); } else { // Otherwise, weak undefs are converted to absolute symbols with value 0. 
@@ -1550,7 +1595,7 @@ template void compute_imported_symbol_weakness(Context &ctx) { Timer t(ctx, "compute_imported_symbol_weakness"); - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { + tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { const ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; @@ -1705,7 +1750,7 @@ void sort_dynsyms(Context &ctx) { // .dynsym. if (ctx.gnu_hash) { auto first_exported = std::stable_partition(first_global, syms.end(), - [&](Symbol *sym) { + [](Symbol *sym) { return !sym->is_exported; }); @@ -1728,7 +1773,7 @@ void sort_dynsyms(Context &ctx) { } // Compute .dynstr size - ctx.dynstr->dynsym_offset = ctx.dynstr->shdr.sh_size; + ctx.dynsym->dynstr_offset = ctx.dynstr->shdr.sh_size; tbb::enumerable_thread_specific size; tbb::parallel_for((i64)1, (i64)syms.size(), [&](i64 i) { @@ -1746,6 +1791,14 @@ template void create_output_symtab(Context &ctx) { Timer t(ctx, "compute_symtab_size"); + if constexpr (needs_thunk) { + i64 n = 0; + for (Chunk *chunk : ctx.chunks) + if (OutputSection *osec = chunk->to_osec()) + for (std::unique_ptr> &thunk : osec->thunks) + thunk->name = "thunk" + std::to_string(n++); + } + tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { chunk->compute_symtab_size(ctx); }); @@ -1858,7 +1911,7 @@ void parse_symbol_version(Context &ctx) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { // Match VERSION part of symbol foo@VERSION with version definitions. - if (!file->has_symver.get(i - file->first_global)) + if (!file->has_symver[i - file->first_global]) continue; Symbol *sym = file->symbols[i]; @@ -1891,7 +1944,7 @@ void parse_symbol_version(Context &ctx) { // defined, the default one takes precedence. 
Symbol *sym2 = get_symbol(ctx, sym->name()); if (sym2->file == file && - !file->has_symver.get(sym2->sym_idx - file->first_global)) + !file->has_symver[sym2->sym_idx - file->first_global]) if (sym2->ver_idx == ctx.default_version || (sym2->ver_idx & ~VERSYM_HIDDEN) == (sym->ver_idx & ~VERSYM_HIDDEN)) sym2->ver_idx = VER_NDX_LOCAL; @@ -1947,7 +2000,7 @@ void compute_import_export(Context &ctx) { // If we are creating an executable, we want to export symbols referenced // by DSOs unless they are explicitly marked as local by a version script. if (!ctx.arg.shared) { - tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { + tbb::parallel_for_each(ctx.dsos, [](SharedFile *file) { for (Symbol *sym : file->symbols) { if (sym->file && !sym->file->is_dso && sym->visibility != STV_HIDDEN && sym->ver_idx != VER_NDX_LOCAL) { @@ -2307,7 +2360,7 @@ void sort_output_sections_by_order(Context &ctx) { chunk->sect_order = get_rank(chunk); // Sort output sections by --section-order - sort(ctx.chunks, [&](Chunk *a, Chunk *b) { + sort(ctx.chunks, [](Chunk *a, Chunk *b) { return a->sect_order < b->sect_order; }); } @@ -2540,13 +2593,12 @@ static void set_virtual_addresses_by_order(Context &ctx) { } // Returns the smallest integer N that satisfies N >= val and -// N mod align == skew mod align. +// N % align == skew % align. // // Section's file offset must be congruent to its virtual address modulo // the page size. We use this function to satisfy that requirement. static u64 align_with_skew(u64 val, u64 align, u64 skew) { - u64 x = align_down(val, align) + skew % align; - return (val <= x) ? x : x + align; + return val + ((skew - val) & (align - 1)); } // Assign file offsets to output sections. @@ -2650,9 +2702,9 @@ void compute_section_headers(Context &ctx) { // Set section indices. 
i64 shndx = 1; - for (i64 i = 0; i < ctx.chunks.size(); i++) - if (!ctx.chunks[i]->is_header()) - ctx.chunks[i]->shndx = shndx++; + for (Chunk *chunk : ctx.chunks) + if (!chunk->is_header()) + chunk->shndx = shndx++; if (ctx.symtab && SHN_LORESERVE <= shndx) { SymtabShndxSection *sec = new SymtabShndxSection; @@ -2666,8 +2718,8 @@ void compute_section_headers(Context &ctx) { if (ctx.shdr) ctx.shdr->shdr.sh_size = shndx * sizeof(ElfShdr); - // Some types of section header refer other section by index. - // Recompute the section header to fill such fields with correct values. + // Some types of section header refer to other section by index. + // Recompute all section headers to fill such fields with correct values. for (Chunk *chunk : ctx.chunks) chunk->update_shdr(ctx); @@ -2705,9 +2757,11 @@ i64 set_osec_offsets(Context &ctx) { template static i64 get_num_irelative_relocs(Context &ctx) { - i64 n = std::count_if(ctx.got->got_syms.begin(), ctx.got->got_syms.end(), - [](Symbol *sym) { return sym->is_ifunc(); }); - return n + ctx.num_ifunc_dynrels; + i64 n = ctx.num_ifunc_dynrels; + for (Symbol *sym : ctx.got->got_syms) + if (sym->is_ifunc()) + n++; + return n; } template @@ -2767,18 +2821,16 @@ void fix_synthetic_symbols(Context &ctx) { // defined in a statically-linked non-relocatable executable because // such executable lacks the .dynamic section and thus there's no way // to find ifunc relocations other than these symbols. - // - // We don't want to set values to these symbols if we are creating a - // static PIE due to a glibc bug. Static PIE has a dynamic section. - // If we set values to these symbols in a static PIE, glibc attempts - // to run ifunc initializers twice, with the second attempt with wrong - // function addresses, causing a segmentation fault. 
 if (ctx.reldyn && ctx.arg.static_ && !ctx.arg.pie) { stop(ctx.__rel_iplt_start, ctx.reldyn); stop(ctx.__rel_iplt_end, ctx.reldyn); - ctx.__rel_iplt_start->value -= get_num_irelative_relocs(ctx) * sizeof(ElfRel); + } else { + // If the symbols are not necessary, we turn them to absolute + // symbols at address 0. + ctx.__rel_iplt_start->origin = 0; + ctx.__rel_iplt_end->origin = 0; } // __{init,fini}_array_{start,end} @@ -2977,6 +3029,46 @@ std::vector> get_shards(Context &ctx) { return vec; } +// Sort dynamic relocations. This is the reason why we do it. +// Quote from https://www.airs.com/blog/archives/186 +// +// The dynamic linker in glibc uses a one element cache when processing +// relocs: if a relocation refers to the same symbol as the previous +// relocation, then the dynamic linker reuses the value rather than +// looking up the symbol again. Thus the dynamic linker gets the best +// results if the dynamic relocations are sorted so that all dynamic +// relocations for a given dynamic symbol are adjacent. +// +// Other than that, the linker sorts together all relative relocations, +// which don't have symbols. Two relative relocations, or two relocations +// against the same symbol, are sorted by the address in the output +// file. This tends to optimize paging and caching when there are two +// references from the same page. +template +void sort_reldyn(Context &ctx) { + Timer t(ctx, "sort_reldyn"); + + ElfRel *begin = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset); + ElfRel *end = begin + ctx.reldyn->shdr.sh_size / sizeof(ElfRel); + + // We group IFUNC relocations at the end of .rel.dyn because we want to + // apply all the other relocations before running user-supplied IFUNC + // resolvers. 
+ auto get_rank = [](u32 r_type) { + if (r_type == E::R_RELATIVE) + return 0; + if constexpr (supports_ifunc) + if (r_type == E::R_IRELATIVE) + return 2; + return 1; + }; + + tbb::parallel_sort(begin, end, [&](const ElfRel &a, const ElfRel &b) { + return std::tuple(get_rank(a.r_type), a.r_sym, a.r_offset) < + std::tuple(get_rank(b.r_type), b.r_sym, b.r_offset); + }); +} + template void write_build_id(Context &ctx) { Timer t(ctx, "write_build_id"); @@ -2997,7 +3089,7 @@ void write_build_id(Context &ctx) { // Make the kernel page out the file contents we've just written // so that subsequent close(2) call will become quicker. if (i > 0 && ctx.output_file->is_mmapped) - madvise(begin, end - begin, MADV_DONTNEED); + madvise(shards[i].data(), shards[i].size(), MADV_DONTNEED); #endif }); @@ -3101,6 +3193,10 @@ void write_separate_debug_file(Context &ctx) { append(ctx.chunks, ctx.debug_chunks); + // Handle --compress-debug-info + if (ctx.arg.compress_debug_sections != COMPRESS_NONE) + compress_debug_sections(ctx); + // Write to the debug info file as if it were a regular output file. 
compute_section_headers(ctx); file->resize(ctx, set_osec_offsets(ctx)); @@ -3209,6 +3305,14 @@ void show_stats(Context &ctx) { static Counter num_objs("num_objs", ctx.objs.size()); static Counter num_dsos("num_dsos", ctx.dsos.size()); + using Entry = typename ConcurrentMap>::Entry; + + static Counter merged_strings("merged_strings"); + for (std::unique_ptr> &sec : ctx.merged_sections) + for (Entry &ent : std::span(sec->map.entries, sec->map.nbuckets)) + if (ent.key) + merged_strings++; + if constexpr (needs_thunk) { static Counter thunk_bytes("thunk_bytes"); for (Chunk *chunk : ctx.chunks) @@ -3217,6 +3321,15 @@ void show_stats(Context &ctx) { thunk_bytes += thunk->size(); } + if constexpr (is_riscv || is_loongarch) { + static Counter num_rels("shrunk_relocs"); + for (Chunk *chunk : ctx.chunks) + if (OutputSection *osec = chunk->to_osec()) + if (osec->shdr.sh_flags & SHF_EXECINSTR) + for (InputSection *isec : osec->members) + num_rels += isec->extra.r_deltas.size(); + } + Counter::print(); for (std::unique_ptr> &sec : ctx.merged_sections) @@ -3267,6 +3380,7 @@ template void compute_section_headers(Context &); template i64 set_osec_offsets(Context &); template void fix_synthetic_symbols(Context &); template void compress_debug_sections(Context &); +template void sort_reldyn(Context &); template void write_build_id(Context &); template void write_gnu_debuglink(Context &); template void write_separate_debug_file(Context &); diff --git a/src/relocatable.cc b/src/relocatable.cc index 639dc6ae87..6b12c027db 100644 --- a/src/relocatable.cc +++ b/src/relocatable.cc @@ -53,7 +53,9 @@ static void r_create_synthetic_sections(Context &ctx) { ctx.strtab = push(new StrtabSection); ctx.symtab = push(new SymtabSection); ctx.shstrtab = push(new ShstrtabSection); - ctx.note_property = push(new NotePropertySection); + + if constexpr (is_x86) + ctx.extra.note_property = push(new NotePropertySection); if constexpr (is_riscv) ctx.extra.riscv_attributes = push(new 
RiscvAttributesSection); @@ -109,10 +111,7 @@ template static void r_claim_unresolved_symbols(Context &ctx) { Timer t(ctx, "r_claim_unresolved_symbols"); - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - if (!file->is_alive) - return; - + tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { const ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; diff --git a/src/shrink-sections.cc b/src/shrink-sections.cc index cfd3f4b37b..f6684cbb86 100644 --- a/src/shrink-sections.cc +++ b/src/shrink-sections.cc @@ -66,21 +66,20 @@ namespace mold { using E = MOLD_TARGET; -static bool is_resizable(InputSection *isec) { - return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) && - (isec->shdr().sh_flags & SHF_EXECINSTR); +template <> +i64 get_r_delta(InputSection &isec, u64 offset) { + std::span deltas = isec.extra.r_deltas; + auto it = std::upper_bound(deltas.begin(), deltas.end(), offset, + [](u64 val, const RelocDelta &x) { + return val <= x.offset; + }); + return (it == deltas.begin()) ? 0 : (it - 1)->delta; } template <> -void shrink_sections(Context &ctx) { +void shrink_sections(Context &ctx) { Timer t(ctx, "shrink_sections"); - // True if we can use the 2-byte instructions. This is usually true on - // Unix because RV64GC is generally considered the baseline hardware. - bool use_rvc = false; - if constexpr (is_riscv) - use_rvc = get_eflags(ctx) & EF_RISCV_RVC; - // Find all relaxable relocations and record how many bytes we can save // into r_deltas. // @@ -93,28 +92,17 @@ void shrink_sections(Context &ctx) { // them. We scan relocations only once here. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (std::unique_ptr> &isec : file->sections) - if (is_resizable(isec.get())) - shrink_section(ctx, *isec, use_rvc); + if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_EXECINSTR)) + shrink_section(ctx, *isec); }); // Fix symbol values. 
tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - for (Symbol *sym : file->symbols) { - if (sym->file != file) - continue; - - InputSection *isec = sym->get_input_section(); - if (!isec || isec->extra.r_deltas.empty()) - continue; - - std::span> rels = isec->get_rels(ctx); - auto it = std::lower_bound(rels.begin(), rels.end(), sym->value, - [&](const ElfRel &r, u64 val) { - return r.r_offset < val; - }); - - sym->value -= isec->extra.r_deltas[it - rels.begin()]; - } + for (Symbol *sym : file->symbols) + if (sym->file == file) + if (InputSection *isec = sym->get_input_section()) + if (i64 delta = get_r_delta(*isec, sym->value)) + sym->value -= delta; }); // Recompute sizes of executable sections @@ -126,8 +114,8 @@ void shrink_sections(Context &ctx) { // Returns the distance between a relocated place and a symbol. template <> -i64 compute_distance(Context &ctx, Symbol &sym, - InputSection &isec, const ElfRel &rel) { +i64 compute_distance(Context &ctx, Symbol &sym, + InputSection &isec, const ElfRel &rel) { // We handle absolute symbols as if they were infinitely far away // because `shrink_section` may increase a distance between a branch // instruction and an absolute symbol. 
Branching to an absolute diff --git a/src/subprocess-unix.cc b/src/subprocess-unix.cc index 4539b34672..da51f8995e 100644 --- a/src/subprocess-unix.cc +++ b/src/subprocess-unix.cc @@ -1,8 +1,7 @@ #include "mold.h" -#include "config.h" +#include #include -#include #include #include #include @@ -170,8 +169,8 @@ void process_run_subcommand(Context &ctx, int argc, char **argv) { // // // Set environment variables - putenv(strdup(("LD_PRELOAD=" + dso_path).c_str())); - putenv(strdup(("MOLD_PATH=" + self).c_str())); + setenv("LD_PRELOAD", dso_path.c_str(), 1); + setenv("MOLD_PATH", self.c_str(), 1); // argv[0]: mold, argv[1]: -run/--run // diff --git a/src/thunks.cc b/src/thunks.cc index 731a118584..499b08aedc 100644 --- a/src/thunks.cc +++ b/src/thunks.cc @@ -20,7 +20,8 @@ // we don't need to try too hard to reduce thunk size to the absolute // minimum. -#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2 +#if MOLD_ARM32 || MOLD_ARM64LE || MOLD_ARM64BE || MOLD_PPC32 || \ + MOLD_PPC64V1 || MOLD_PPC64V2 #include "mold.h" @@ -31,39 +32,17 @@ namespace mold { using E = MOLD_TARGET; -// Returns a branch reach in bytes for a given target. -static consteval i64 max_distance() { - // ARM64's branch has 26 bits immediate. The immediate is padded with - // implicit two-bit zeros because all instructions are 4 bytes aligned - // and therefore the least two bits are always zero. So the branch - // operand is effectively 28 bits long. That means the branch range is - // [-2^27, 2^27) or PC ± 128 MiB. - if (is_arm64) - return 1 << 27; - - // ARM32's Thumb branch has 24 bits immediate, and the instructions are - // aligned to 2, so it's effectively 25 bits. It's [-2^24, 2^24) or PC ± - // 16 MiB. - // - // ARM32's non-Thumb branches have twice longer range than its Thumb - // counterparts, but we conservatively use the Thumb's limitation. 
- if (is_arm32) - return 1 << 24; - - // PPC's branch has 24 bits immediate, and the instructions are aligned - // to 4, therefore the reach is [-2^25, 2^25) or PC ± 32 MiB. - assert(is_ppc); - return 1 << 25; -} - -// We create thunks for each 12.8/1.6/3.2 MiB code block for +// We create thunks for each 25.6/3.2/6.4 MiB code block for // ARM64/ARM32/PPC, respectively. -static constexpr i64 batch_size = max_distance() / 10; +static constexpr i64 batch_size = branch_distance / 5; -// We assume that a single thunk group is smaller than 900 KiB. -static constexpr i64 max_thunk_size = 900 * 1024; +// We assume that a single thunk group is smaller than 1 MiB. +static constexpr i64 max_thunk_size = 1024 * 1024; -static_assert(max_thunk_size / E::thunk_size < ThunkRef::MAX_SYM_IDX); +// We align thunks to 16 byte boundaries because many processor vendors +// recommend we align branch targets to 16 byte boundaries for performance +// reasons. +static constexpr i64 thunk_align = 16; template static bool is_reachable(Context &ctx, InputSection &isec, @@ -113,56 +92,12 @@ static bool is_reachable(Context &ctx, InputSection &isec, i64 A = get_addend(isec, rel); i64 P = isec.get_addr() + rel.r_offset; i64 val = S + A - P; - return -max_distance() <= val && val < max_distance(); + return -branch_distance <= val && val < branch_distance; } -static void reset_thunk(Thunk &thunk) { - for (Symbol *sym : thunk.symbols) { - sym->extra.thunk_idx = -1; - sym->extra.thunk_sym_idx = -1; +static void reset(Thunk &thunk) { + for (Symbol *sym : thunk.symbols) sym->flags = 0; - } -} - -// Scan relocations to collect symbols that need thunks. 
-static void scan_rels(Context &ctx, InputSection &isec, - Thunk &thunk, i64 thunk_idx) { - std::span> rels = isec.get_rels(ctx); - std::vector &thunk_refs = isec.extra.thunk_refs; - thunk_refs.resize(rels.size()); - - for (i64 i = 0; i < rels.size(); i++) { - const ElfRel &rel = rels[i]; - if (!is_func_call_rel(rel)) - continue; - - // Skip if the symbol is undefined. apply_reloc() will report an error. - Symbol &sym = *isec.file.symbols[rel.r_sym]; - if (!sym.file) - continue; - - // Skip if the destination is within reach. - if (is_reachable(ctx, isec, sym, rel)) - continue; - - // This relocation needs a thunk. If the symbol is already in a - // previous thunk, reuse it. - if (sym.extra.thunk_idx != -1) { - thunk_refs[i].thunk_idx = sym.extra.thunk_idx; - thunk_refs[i].sym_idx = sym.extra.thunk_sym_idx; - continue; - } - - // Otherwise, add the symbol to the current thunk if it's not - // added already. - thunk_refs[i].thunk_idx = thunk_idx; - thunk_refs[i].sym_idx = -1; - - if (sym.flags.exchange(-1) == 0) { - std::scoped_lock lock(thunk.mu); - thunk.symbols.push_back(&sym); - } - } } template <> @@ -188,14 +123,14 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { // from the current batch. // // D is the input section with the largest address such that the thunk - // is reachable from the current batch if it's inserted right before D. + // is reachable from the current batch if it's inserted at D. // // ................................ ............ 
// A B C D // ^ We insert a thunk for the current batch just before D // <---> The current batch, which is smaller than BATCH_SIZE - // <--------> Smaller than MAX_DISTANCE - // <--------> Smaller than MAX_DISTANCE + // <--------> Smaller than BRANCH_DISTANCE + // <--------> Smaller than BRANCH_DISTANCE // <-------------> Reachable from the current batch i64 a = 0; i64 b = 0; @@ -210,11 +145,11 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { // Move D foward as far as we can jump from B to a thunk at D. auto d_thunk_end = [&] { u64 d_end = align_to(offset, 1 << m[d]->p2align) + m[d]->sh_size; - return align_to(d_end, Thunk::alignment) + max_thunk_size; + return align_to(d_end, thunk_align) + max_thunk_size; }; while (d < m.size() && - (b == d || d_thunk_end() <= m[b]->offset + max_distance())) { + (b == d || d_thunk_end() <= m[b]->offset + branch_distance)) { offset = align_to(offset, 1 << m[d]->p2align); m[d]->offset = offset; offset += m[d]->sh_size; @@ -229,65 +164,98 @@ void OutputSection::create_range_extension_thunks(Context &ctx) { c++; // Move A forward so that A is reachable from C. - i64 c_offset = (c == m.size()) ? offset : m[c]->offset; - while (a < b && m[a]->offset + max_distance() < c_offset) + i64 c_offset = (c == d) ? offset : m[c]->offset; + while (a < b && m[a]->offset + branch_distance < c_offset) a++; // Erase references to out-of-range thunks. while (t < thunks.size() && thunks[t]->offset < m[a]->offset) - reset_thunk(*thunks[t++]); + reset(*thunks[t++]); // Create a new thunk and place it at D. - offset = align_to(offset, Thunk::alignment); - i64 thunk_idx = thunks.size(); - Thunk *thunk = new Thunk(*this, offset); - thunks.emplace_back(thunk); + offset = align_to(offset, thunk_align); + thunks.emplace_back(std::make_unique>(*this, offset)); + Thunk &thunk = *thunks.back(); // Scan relocations between B and C to collect symbols that need // entries in the new thunk. 
- for (i64 i = b; i < c; i++) - scan_rels(ctx, *m[i], *thunk, thunk_idx); - - // Now that we know the number of symbols in the thunk, we can compute - // the thunk's size. - assert(thunk->size() < max_thunk_size); - offset += thunk->size(); + std::mutex mu; + + tbb::parallel_for(b, c, [&](i64 i) { + InputSection &isec = *m[i]; + + for (const ElfRel &rel : isec.get_rels(ctx)) { + if (!is_func_call_rel(rel)) + continue; + + // Skip if the symbol is undefined. apply_reloc() will report an error. + Symbol &sym = *isec.file.symbols[rel.r_sym]; + if (!sym.file) + continue; + + // Skip if the destination is within reach. + if (is_reachable(ctx, isec, sym, rel)) + continue; + + // Add the symbol to the current thunk if it's not added already + // by another thread. + if (!sym.flags.test_and_set()) { + std::scoped_lock lock(mu); + thunk.symbols.push_back(&sym); + } + } + }); // Sort symbols added to the thunk to make the output deterministic. - sort(thunk->symbols, [](Symbol *a, Symbol *b) { + sort(thunk.symbols, [](Symbol *a, Symbol *b) { return std::tuple{a->file->priority, a->sym_idx} < std::tuple{b->file->priority, b->sym_idx}; }); - // Assign offsets within the thunk to the symbols. - for (i64 i = 0; Symbol *sym : thunk->symbols) { - sym->extra.thunk_idx = thunk_idx; - sym->extra.thunk_sym_idx = i++; - } - - // Scan relocations again to fix symbol offsets in the last thunk. - for (i64 i = b; i < c; i++) { - std::span *> syms = m[i]->file.symbols; - std::span> rels = m[i]->get_rels(ctx); - std::span thunk_refs = m[i]->extra.thunk_refs; - - for (i64 j = 0; j < rels.size(); j++) - if (thunk_refs[j].thunk_idx == thunk_idx) - thunk_refs[j].sym_idx = syms[rels[j].r_sym]->extra.thunk_sym_idx; - } + // Now that we know the number of symbols in the thunk, we can compute + // the thunk's size. + assert(thunk.size() < max_thunk_size); + offset += thunk.size(); // Move B forward to point to the begining of the next batch. 
b = c; } while (t < thunks.size()) - reset_thunk(*thunks[t++]); - + reset(*thunks[t++]); this->shdr.sh_size = offset; +} - for (InputSection *isec : members) - this->shdr.sh_addralign = - std::max(this->shdr.sh_addralign, 1 << isec->p2align); +// When applying relocations, we want to know the address in a reachable +// range extension thunk for a given symbol. Doing it by scanning all +// reachable range extension thunks is too expensive. +// +// In this function, we create a list of all addresses in range extension +// thunks for each symbol, so that it is easy to find one. +// +// Note that thunk_addrs must be sorted for binary search. +template <> +void gather_thunk_addresses(Context &ctx) { + Timer t(ctx, "gather_thunk_addresses"); + + std::vector *> sections; + for (Chunk *chunk : ctx.chunks) + if (OutputSection *osec = chunk->to_osec()) + sections.push_back(osec); + + sort(sections, [](OutputSection *a, OutputSection *b) { + return a->shdr.sh_addr < b->shdr.sh_addr; + }); + + for (OutputSection *osec : sections) { + for (std::unique_ptr> &thunk : osec->thunks) { + for (i64 i = 0; i < thunk->symbols.size(); i++) { + Symbol &sym = *thunk->symbols[i]; + sym.add_aux(ctx); + ctx.symbol_aux[sym.aux_idx].thunk_addrs.push_back(thunk->get_addr(i)); + } + } + } } } // namespace mold diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e64a1f0ee0..9a565b2cf7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ endif() if(${MACHINE} MATCHES "amd64") set(MACHINE x86_64) -elseif(${MACHINE} MATCHES "i386") +elseif(${MACHINE} MATCHES "i.86") set(MACHINE i686) elseif(${MACHINE} MATCHES "arm.*") set(MACHINE arm) @@ -30,13 +30,14 @@ endif() if(MOLD_ENABLE_QEMU_TESTS) list(APPEND QEMU_ARCHS - x86_64 i386 arm aarch64 ppc ppc64 ppc64le sparc64 sh4 s390x - riscv64 riscv32 m68k loongarch64) + x86_64 i386 arm aarch64 aarch64_be ppc ppc64 ppc64le sparc64 sh4 sh4eb + s390x riscv64 riscv32 m68k loongarch64) LIST(APPEND TRIPLES x86_64-linux-gnu 
i686-linux-gnu aarch64-linux-gnu + aarch64_be-linux-gnu arm-linux-gnueabihf riscv64-linux-gnu powerpc-linux-gnu @@ -45,6 +46,7 @@ if(MOLD_ENABLE_QEMU_TESTS) sparc64-linux-gnu s390x-linux-gnu sh4-linux-gnu + sh4aeb-linux-gnu riscv32-linux-gnu m68k-linux-gnu loongarch64-linux-gnu) @@ -111,6 +113,10 @@ if(${MACHINE} STREQUAL "aarch64" OR (HAS_qemu-aarch64 AND HAS_aarch64-linux-gnu- add_target(aarch64 aarch64-linux-gnu) endif() +if(${MACHINE} STREQUAL "aarch64_be" OR (HAS_qemu-aarch64_be AND HAS_aarch64_be-linux-gnu-gcc)) + add_target(aarch64_be aarch64_be-linux-gnu) +endif() + if(${MACHINE} STREQUAL "arm" OR (HAS_qemu-arm AND HAS_arm-linux-gnueabihf-gcc)) add_target(arm arm-linux-gnueabihf) endif() @@ -167,6 +173,10 @@ if(${MACHINE} STREQUAL "sh4" OR (HAS_qemu-sh4 AND HAS_sh4-linux-gnu-gcc)) add_target(sh4 sh4-linux-gnu) endif() +if(${MACHINE} STREQUAL "sh4aeb" OR (HAS_qemu-sh4eb AND HAS_sh4aeb-linux-gnu-gcc)) + add_target(sh4aeb sh4aeb-linux-gnu) +endif() + if(${MACHINE} STREQUAL "m68k" OR (HAS_qemu-m68k AND HAS_m68k-linux-gnu-gcc)) add_target(m68k m68k-linux-gnu) endif() diff --git a/test/abs-error.sh b/test/abs-error.sh index 65499c3101..7069f72cc7 100755 --- a/test/abs-error.sh +++ b/test/abs-error.sh @@ -18,5 +18,5 @@ extern char foo; int main() { printf("foo=%p\n", &foo); } EOF -! $CC -B. -o $t/exe -pie $t/a.o $t/b.o -Wl,-z,text >& $t/log -grep -q 'recompile with -fPIC' $t/log +not $CC -B. -o $t/exe -pie $t/a.o $t/b.o -Wl,-z,text |& + grep 'recompile with -fPIC' diff --git a/test/absolute-symbols.sh b/test/absolute-symbols.sh index febd3e36b3..304f8e0d94 100755 --- a/test/absolute-symbols.sh +++ b/test/absolute-symbols.sh @@ -36,4 +36,4 @@ int main() { EOF $CC -B. 
-o $t/exe -no-pie $t/a.o $t/b.o -$QEMU $t/exe | grep -q '^ip=0xa0000.$' +$QEMU $t/exe | grep '^ip=0xa0000.$' diff --git a/test/allow-multiple-definition.sh b/test/allow-multiple-definition.sh index e7e0842f51..d61a30a1fd 100755 --- a/test/allow-multiple-definition.sh +++ b/test/allow-multiple-definition.sh @@ -4,6 +4,6 @@ echo 'int main() { return 0; }' | $CC -c -o $t/a.o -xc - echo 'int main() { return 1; }' | $CC -c -o $t/b.o -xc - -! $CC -B. -o $t/exe $t/a.o $t/b.o 2> /dev/null || false +not $CC -B. -o $t/exe $t/a.o $t/b.o 2> /dev/null $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-allow-multiple-definition $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-z,muldefs diff --git a/test/arch-aarch64-range-extension-thunk-disassembly.sh b/test/arch-aarch64-range-extension-thunk-disassembly.sh index 4c7c5fce12..02aec7fac4 100755 --- a/test/arch-aarch64-range-extension-thunk-disassembly.sh +++ b/test/arch-aarch64-range-extension-thunk-disassembly.sh @@ -18,4 +18,4 @@ EOF $CC -B. -o $t/exe $t/a.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$OBJDUMP -dr $t/exe | grep -Fq ':' +$OBJDUMP -dr $t/exe | grep -E ':' diff --git a/test/arch-aarch64-variant-pcs.sh b/test/arch-aarch64-variant-pcs.sh index c434cfa2f9..283d1c4e4a 100755 --- a/test/arch-aarch64-variant-pcs.sh +++ b/test/arch-aarch64-variant-pcs.sh @@ -10,7 +10,7 @@ foo: EOF $CC -B. -shared -o $t/b.so $t/a.o -readelf -W --dyn-syms $t/b.so | grep foo | grep -Fq '[VARIANT_PCS]' +readelf -W --dyn-syms $t/b.so | grep foo | grep -F '[VARIANT_PCS]' cat <& /dev/null && skip -! $CC -B. -o $t/exe -pie $t/a.o $t/b.o >& $t/log -grep -q 'recompile with -fPIC' $t/log +not $CC -B. -o $t/exe -pie $t/a.o $t/b.o |& + grep 'recompile with -fPIC' diff --git a/test/arch-arm-exidx-sentinel.sh b/test/arch-arm-exidx-sentinel.sh new file mode 100755 index 0000000000..eb41f63ebd --- /dev/null +++ b/test/arch-arm-exidx-sentinel.sh @@ -0,0 +1,13 @@ +#!/bin/bash +. 
$(dirname $0)/common.inc + +cat < +extern char _etext[]; +int main() { + printf("%p\n", _etext); +} +EOF + +$CXX -B. -o $t/exe $t/a.o -no-pie +readelf --unwind $t/exe | grep "$($QEMU $t/exe) .*cantunwind" diff --git a/test/arch-arm-range-extension-thunk-disassembly.sh b/test/arch-arm-range-extension-thunk-disassembly.sh index b1f28d793e..c84d6c1c03 100755 --- a/test/arch-arm-range-extension-thunk-disassembly.sh +++ b/test/arch-arm-range-extension-thunk-disassembly.sh @@ -18,7 +18,7 @@ EOF $CC -B. -o $t/exe $t/a.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$OBJDUMP -dr $t/exe | grep -F -A7 ':' > $t/log +$OBJDUMP -dr $t/exe | grep -E -A7 ':' > $t/log -grep -Eq 'bx\s+pc' $t/log -grep -Eq 'add\s+pc, ip, pc' $t/log +grep -E 'bx\s+pc' $t/log +grep -E 'add\s+pc, ip, pc' $t/log diff --git a/test/arch-arm-range-extension-thunk.sh b/test/arch-arm-range-extension-thunk.sh index 191b1c51fc..fe7f1d3e5c 100755 --- a/test/arch-arm-range-extension-thunk.sh +++ b/test/arch-arm-range-extension-thunk.sh @@ -1,8 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int main() {}' | $CC -c -o /dev/null -xc - -O0 -mthumb >& /dev/null \ - || skip +test_cflags -mthumb || skip cat < $t/a.c #include @@ -35,11 +34,11 @@ $CC -c -o $t/d.o $t/b.c -O0 -marm $CC -B. -o $t/exe $t/c.o $t/d.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$QEMU $t/exe | grep -q 'main fn1 fn3 fn2 fn4' +$QEMU $t/exe | grep 'main fn1 fn3 fn2 fn4' $CC -c -o $t/e.o $t/a.c -O2 -mthumb $CC -c -o $t/f.o $t/b.c -O2 -marm $CC -B. -o $t/exe $t/e.o $t/f.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$QEMU $t/exe | grep -q 'main fn1 fn3 fn2 fn4' +$QEMU $t/exe | grep 'main fn1 fn3 fn2 fn4' diff --git a/test/arch-arm-target1.sh b/test/arch-arm-target1.sh index fffdd9340f..baf1f366af 100755 --- a/test/arch-arm-target1.sh +++ b/test/arch-arm-target1.sh @@ -16,4 +16,4 @@ int main() { printf("%s\n", foo); } EOF $CC -B. 
-o $t/exe -pie $t/a.o $t/b.o -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/arch-arm-thm-jump19.sh b/test/arch-arm-thm-jump19.sh new file mode 100755 index 0000000000..3dc28a77bf --- /dev/null +++ b/test/arch-arm-thm-jump19.sh @@ -0,0 +1,28 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < $t/log +grep -E 'beq\.w.*' $t/log diff --git a/test/arch-arm-thumb-interwork.sh b/test/arch-arm-thumb-interwork.sh index aed7b236ac..ccb87b2691 100755 --- a/test/arch-arm-thumb-interwork.sh +++ b/test/arch-arm-thumb-interwork.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int foo() { return 0; }' | $CC -o /dev/null -c -xc - -mthumb 2> /dev/null || skip +test_cflags -mthumb || skip cat < @@ -27,4 +27,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q 'main foo bar' +$QEMU $t/exe | grep 'main foo bar' diff --git a/test/arch-arm-tlsdesc.sh b/test/arch-arm-tlsdesc.sh index f8a2e5a3e7..a492f9d107 100755 --- a/test/arch-arm-tlsdesc.sh +++ b/test/arch-arm-tlsdesc.sh @@ -2,9 +2,7 @@ . $(dirname $0)/common.inc is_musl && skip - -echo 'int main() {}' | $GCC -c -o /dev/null -xc - -O0 -mthumb >& /dev/null \ - || skip +test_cflags -mthumb || skip cat < $t/a.c extern _Thread_local int foo; @@ -42,24 +40,33 @@ $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/c.o $t/a.c -marm $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/d.o $t/b.c -marm $CC -B. -o $t/exe1 $t/c.o $t/d.o -$QEMU $t/exe1 | grep -q '42 5' +$QEMU $t/exe1 | grep '42 5' $CC -B. -o $t/exe2 $t/c.o $t/d.o -Wl,-no-relax -$QEMU $t/exe2 | grep -q '42 5' +$QEMU $t/exe2 | grep '42 5' $CC -B. -o $t/exe3 $t/c.o $t/d.o -Wl,-no-relax \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$QEMU $t/exe3 | grep -q '42 5' +$QEMU $t/exe3 | grep '42 5' + +$GCC -B. -shared -o $t/c.so $t/c.o -Wl,-z,nodlopen +$CC -B. 
-o $t/exe4 $t/c.so $t/d.o +$QEMU $t/exe4 | grep '42 5' + $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/e.o $t/a.c -mthumb $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/f.o $t/b.c -mthumb -$CC -B. -o $t/exe4 $t/e.o $t/f.o -$QEMU $t/exe4 | grep -q '42 5' +$CC -B. -o $t/exe5 $t/e.o $t/f.o +$QEMU $t/exe5 | grep '42 5' -$CC -B. -o $t/exe5 $t/e.o $t/f.o -Wl,-no-relax -$QEMU $t/exe5 | grep -q '42 5' +$CC -B. -o $t/exe6 $t/e.o $t/f.o -Wl,-no-relax +$QEMU $t/exe6 | grep '42 5' -$CC -B. -o $t/exe6 $t/e.o $t/f.o -Wl,-no-relax \ +$CC -B. -o $t/exe7 $t/e.o $t/f.o -Wl,-no-relax \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$QEMU $t/exe6 | grep -q '42 5' +$QEMU $t/exe7 | grep '42 5' + +$GCC -B. -shared -o $t/e.so $t/e.o -Wl,-z,nodlopen +$CC -B. -o $t/exe8 $t/e.so $t/f.o +$QEMU $t/exe8 | grep '42 5' diff --git a/test/arch-i686-tls-module-base.sh b/test/arch-i686-tls-module-base.sh index 2e906ca7b0..fa07f53eca 100755 --- a/test/arch-i686-tls-module-base.sh +++ b/test/arch-i686-tls-module-base.sh @@ -37,11 +37,11 @@ int main() { EOF $CC -o $t/exe1 $t/a.o $t/b.o $t/c.o -pie -$QEMU $t/exe1 | grep -q '^20 3$' +$QEMU $t/exe1 | grep '^20 3$' $CC -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,-no-relax -pie -$QEMU $t/exe2 | grep -q '^20 3$' +$QEMU $t/exe2 | grep '^20 3$' $CC -o $t/d.so $t/a.o -shared $CC -o $t/exe3 $t/b.o $t/c.o $t/d.so -pie -$QEMU $t/exe3 | grep -q '^20 3$' +$QEMU $t/exe3 | grep '^20 3$' diff --git a/test/arch-i686-tlsdesc.sh b/test/arch-i686-tlsdesc.sh index 6363b380f9..02c5976ead 100755 --- a/test/arch-i686-tlsdesc.sh +++ b/test/arch-i686-tlsdesc.sh @@ -34,15 +34,15 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -$QEMU $t/exe1 | grep -q 42 +$QEMU $t/exe1 | grep 42 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax -$QEMU $t/exe2 | grep -q 42 +$QEMU $t/exe2 | grep 42 $CC -B. -shared -o $t/c.so $t/a.o $CC -B. -o $t/exe3 $t/b.o $t/c.so -$QEMU $t/exe3 | grep -q 42 +$QEMU $t/exe3 | grep 42 $CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax $CC -B. 
-o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax -$QEMU $t/exe4 | grep -q 42 +$QEMU $t/exe4 | grep 42 diff --git a/test/arch-loongarch64-mcmodel-extreme.sh b/test/arch-loongarch64-mcmodel-extreme.sh index 3fc099ec23..e180ca4b31 100755 --- a/test/arch-loongarch64-mcmodel-extreme.sh +++ b/test/arch-loongarch64-mcmodel-extreme.sh @@ -8,4 +8,4 @@ int main() { printf(msg); } EOF $CC -B. -o $t/exe1 $t/a.o -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' diff --git a/test/arch-loongarch64-relax-call36.sh b/test/arch-loongarch64-relax-call36.sh index 34e40982a1..538880ae93 100755 --- a/test/arch-loongarch64-relax-call36.sh +++ b/test/arch-loongarch64-relax-call36.sh @@ -38,15 +38,15 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,--no-relax -$QEMU $t/exe1 | grep -q foofoo +$QEMU $t/exe1 | grep foofoo $OBJDUMP -d $t/exe1 > $t/exe1.objdump -grep -A2 ':' $t/exe1.objdump | grep -wq pcaddu18i -grep -A2 ':' $t/exe1.objdump | grep -wq pcaddu18i +grep -A2 ':' $t/exe1.objdump | grep -w pcaddu18i +grep -A2 ':' $t/exe1.objdump | grep -w pcaddu18i $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--relax -$QEMU $t/exe2 | grep -q foofoo +$QEMU $t/exe2 | grep foofoo $OBJDUMP -d $t/exe2 > $t/exe2.objdump -grep -A2 ':' $t/exe2.objdump | grep -wq bl -grep -A2 ':' $t/exe2.objdump | grep -wq b +grep -A2 ':' $t/exe2.objdump | grep -w bl +grep -A2 ':' $t/exe2.objdump | grep -w b diff --git a/test/arch-loongarch64-relax-got-load.sh b/test/arch-loongarch64-relax-got-load.sh index 279fa8b5fd..76a7dda123 100755 --- a/test/arch-loongarch64-relax-got-load.sh +++ b/test/arch-loongarch64-relax-got-load.sh @@ -17,17 +17,17 @@ int main() { printf("%d\n", get_foo()); } EOF $CC -B. 
-o $t/exe1 $t/a.o $t/b.o $t/c.o -pie -Wl,--no-relax -$QEMU $t/exe1 | grep -q '^3$' -$OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fqw pcalau12i -$OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fqw ld.d +$QEMU $t/exe1 | grep '^3$' +$OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fw pcalau12i +$OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fw ld.d $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax -$QEMU $t/exe2 | grep -q '^3$' -$OBJDUMP -d $t/exe2 | grep -A1 ':' | grep -Fqw pcaddi +$QEMU $t/exe2 | grep '^3$' +$OBJDUMP -d $t/exe2 | grep -A1 ':' | grep -Fw pcaddi $CC -B. -o $t/exe3 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax \ -Wl,-Ttext=0x1000000,-Tdata=0x2000000 -$QEMU $t/exe3 | grep -q '^3$' -$OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fqw pcalau12i -$OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fqw addi.d +$QEMU $t/exe3 | grep '^3$' +$OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fw pcalau12i +$OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fw addi.d diff --git a/test/arch-loongarch64-relax-pcala-addi.sh b/test/arch-loongarch64-relax-pcala-addi.sh index fe26c73cb6..8f18c1f7b3 100755 --- a/test/arch-loongarch64-relax-pcala-addi.sh +++ b/test/arch-loongarch64-relax-pcala-addi.sh @@ -42,17 +42,17 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax -$QEMU $t/exe1 | grep -q '^1 2 3$' +$QEMU $t/exe1 | grep '^1 2 3$' $OBJDUMP -d $t/exe1 > $t/exe1.objdump -grep -A1 ':' $t/exe1.objdump | grep -q pcalau12i -grep -A1 ':' $t/exe1.objdump | grep -q pcalau12i -grep -A1 ':' $t/exe1.objdump | grep -q pcalau12i +grep -A1 ':' $t/exe1.objdump | grep pcalau12i +grep -A1 ':' $t/exe1.objdump | grep pcalau12i +grep -A1 ':' $t/exe1.objdump | grep pcalau12i $CC -B. 
-o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,--relax -$QEMU $t/exe2 | grep -q '^1 2 3$' +$QEMU $t/exe2 | grep '^1 2 3$' $OBJDUMP -d $t/exe2 > $t/exe2.objdump -grep -A1 ':' $t/exe2.objdump | grep -q pcaddi -grep -A1 ':' $t/exe2.objdump | grep -q pcaddi -grep -A1 ':' $t/exe2.objdump | grep -q pcalau12i +grep -A1 ':' $t/exe2.objdump | grep pcaddi +grep -A1 ':' $t/exe2.objdump | grep pcaddi +grep -A1 ':' $t/exe2.objdump | grep pcalau12i diff --git a/test/arch-loongarch64-relax-tlsdesc.sh b/test/arch-loongarch64-relax-tlsdesc.sh index 37b4471590..221bc49898 100755 --- a/test/arch-loongarch64-relax-tlsdesc.sh +++ b/test/arch-loongarch64-relax-tlsdesc.sh @@ -29,15 +29,15 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--no-relax -$QEMU $t/exe1 | grep -q 'foo bar' +$QEMU $t/exe1 | grep 'foo bar' $OBJDUMP -d $t/exe1 > $t/exe1.objdump -grep -A6 ':' $t/exe1.objdump | grep -Fq pcalau12i -grep -A6 ':' $t/exe1.objdump | grep -Fq pcalau12i +grep -A6 ':' $t/exe1.objdump | grep -F pcalau12i +grep -A6 ':' $t/exe1.objdump | grep -F pcalau12i $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--relax -$QEMU $t/exe2 | grep -q 'foo bar' +$QEMU $t/exe2 | grep 'foo bar' $OBJDUMP -d $t/exe2 > $t/exe2.objdump -grep -A6 ':' $t/exe2.objdump | grep -Fq li.w -grep -A6 ':' $t/exe2.objdump | grep -Fq lu12i.w +grep -A6 ':' $t/exe2.objdump | grep -F li.w +grep -A6 ':' $t/exe2.objdump | grep -F lu12i.w diff --git a/test/arch-ppc64le-save-restore-gprs.sh b/test/arch-ppc64le-save-restore-gprs.sh index 2a2fd10158..d60a1f8004 100755 --- a/test/arch-ppc64le-save-restore-gprs.sh +++ b/test/arch-ppc64le-save-restore-gprs.sh @@ -9,4 +9,4 @@ int main() { EOF $CC -B. 
-o $t/exe $t/a.o -$OBJDUMP -d $t/exe | grep -q '<_savegpr0_14>' +$OBJDUMP -d $t/exe | grep '<_savegpr0_14>' diff --git a/test/arch-riscv64-attributes.sh b/test/arch-riscv64-attributes.sh index 9ea4270d85..8f3267ced1 100755 --- a/test/arch-riscv64-attributes.sh +++ b/test/arch-riscv64-attributes.sh @@ -6,7 +6,7 @@ void foo() {} EOF # The compiler might not create .riscv.attributes -readelf --sections $t/a.o | grep -Fq .riscv.attributes || skip +readelf --sections $t/a.o | grep -F .riscv.attributes || skip cat < $t/log -! grep -F .riscv.attributes $t/log || false -! grep -F RISCV_ATTR $t/log || false +not grep -F .riscv.attributes $t/log +not grep -F RISCV_ATTR $t/log diff --git a/test/arch-riscv64-global-pointer-dso.sh b/test/arch-riscv64-global-pointer-dso.sh index 7f6fef37c7..10a2759eca 100755 --- a/test/arch-riscv64-global-pointer-dso.sh +++ b/test/arch-riscv64-global-pointer-dso.sh @@ -24,4 +24,4 @@ int main() { EOF $CC -B. -o $t/exe $t/b.so $t/c.o $t/d.o -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/arch-riscv64-global-pointer.sh b/test/arch-riscv64-global-pointer.sh index b184e8d5b3..6b40befa94 100755 --- a/test/arch-riscv64-global-pointer.sh +++ b/test/arch-riscv64-global-pointer.sh @@ -9,10 +9,10 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -fno-PIE -readelf -W --dyn-syms $t/exe1 | grep -Fq '__global_pointer$' +readelf -W --dyn-syms $t/exe1 | grep -F '__global_pointer$' $CC -B. -o $t/exe2 $t/a.o -fPIE -readelf -W --dyn-syms $t/exe2 | grep -Fq '__global_pointer$' +readelf -W --dyn-syms $t/exe2 | grep -F '__global_pointer$' cat < @@ -22,5 +22,4 @@ int hello() { EOF $CC -B. -o $t/c.so $t/b.o -shared -readelf -W --dyn-syms $t/c.so > $t/log1 -! 
grep -Fq '__global_pointer$' $t/log1 || false +readelf -W --dyn-syms $t/c.so | not grep -F '__global_pointer$' diff --git a/test/arch-riscv64-norvc.sh b/test/arch-riscv64-norvc.sh deleted file mode 100755 index 1383d4e094..0000000000 --- a/test/arch-riscv64-norvc.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -# Disable C extension -if [ $MACHINE = riscv32 ]; then - ISA=rv32g -else - ISA=rv64g -fi - -cat < $t/log || false -grep -q 'cannot link object files with different floating-point ABI' $t/log +not $CC -B. -o $t/exe $t/a.o $t/b.o |& + grep 'cannot link object files with different floating-point ABI' diff --git a/test/arch-riscv64-relax-align.sh b/test/arch-riscv64-relax-align.sh new file mode 100755 index 0000000000..b88ec6b4d5 --- /dev/null +++ b/test/arch-riscv64-relax-align.sh @@ -0,0 +1,33 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +#include +extern char x1, x2; +int main() { + printf("%lu %lu %lu\n", + (uintptr_t)&x1 % 32, + (uintptr_t)&x2 % 32, + &x2 - &x1); +} +EOF + +$CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o +$QEMU $t/exe | grep '0 0 32' diff --git a/test/arch-riscv64-relax-got.sh b/test/arch-riscv64-relax-got.sh index 14e3335523..364fcc2597 100755 --- a/test/arch-riscv64-relax-got.sh +++ b/test/arch-riscv64-relax-got.sh @@ -71,9 +71,9 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax -$QEMU $t/exe1 | grep -Eq '^0 ba beef 11beef deadbeef$' +$QEMU $t/exe1 | grep -E '^0 ba beef 11beef deadbeef$' $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -$QEMU $t/exe2 | grep -Eq '^0 ba beef 11beef deadbeef$' +$QEMU $t/exe2 | grep -E '^0 ba beef 11beef deadbeef$' -$OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -Eq $'li[ \t]+a0,186$' +$OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -E $'li[ \t]+a0,186$' diff --git a/test/arch-riscv64-relax-hi20.sh b/test/arch-riscv64-relax-hi20.sh index fb4774eba3..55cd038fd0 100755 --- a/test/arch-riscv64-relax-hi20.sh +++ b/test/arch-riscv64-relax-hi20.sh @@ -2,29 +2,38 @@ . 
$(dirname $0)/common.inc cat < $t/log1 +grep -E ' 10 NOTYPE .* get_foo$' $t/log1 +grep -E ' 10 NOTYPE .* get_foo2$' $t/log1 +grep -E ' 10 NOTYPE .* get_bar$' $t/log1 +grep -E ' 10 NOTYPE .* get_bar2$' $t/log1 +grep -E ' 10 NOTYPE .* get_baz$' $t/log1 -[ $(stat --format='%s' $t/exe1) -gt $(stat --format='%s' $t/exe2) ] +readelf --syms $t/exe2 > $t/log2 +grep -E ' 8 NOTYPE .* get_foo$' $t/log2 +grep -E ' 10 NOTYPE .* get_foo2$' $t/log2 +grep -E ' 6 NOTYPE .* get_bar$' $t/log2 +grep -E ' 10 NOTYPE .* get_bar2$' $t/log2 +grep -E ' 10 NOTYPE .* get_baz$' $t/log2 diff --git a/test/arch-riscv64-relax-j.sh b/test/arch-riscv64-relax-j.sh new file mode 100755 index 0000000000..8046721252 --- /dev/null +++ b/test/arch-riscv64-relax-j.sh @@ -0,0 +1,31 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <' + + +cat <' diff --git a/test/arch-riscv64-symbol-size.sh b/test/arch-riscv64-symbol-size.sh new file mode 100755 index 0000000000..1be069a85a --- /dev/null +++ b/test/arch-riscv64-symbol-size.sh @@ -0,0 +1,40 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +int get_foo(); +int get_bar(); +int main() { printf("%x %x\n", get_foo(), get_bar()); } +EOF + +$CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o + +readelf --syms $t/a.o | grep -E ' 10 FUNC .* get_foo$' +readelf --syms $t/a.o | grep -E ' 10 FUNC .* get_bar$' + +readelf --syms $t/exe | grep -E ' 8 FUNC .* get_foo$' +readelf --syms $t/exe | grep -E ' 8 FUNC .* get_bar$' diff --git a/test/arch-riscv64-variant-cc.sh b/test/arch-riscv64-variant-cc.sh index af6a268312..d3672a5b04 100755 --- a/test/arch-riscv64-variant-cc.sh +++ b/test/arch-riscv64-variant-cc.sh @@ -10,7 +10,7 @@ foo: EOF $CC -B. -shared -o $t/b.so $t/a.o -readelf -W --dyn-syms $t/b.so | grep foo | grep -Fq '[VARIANT_CC]' +readelf -W --dyn-syms $t/b.so | grep foo | grep -F '[VARIANT_CC]' cat < $t/log -grep -q '.rodata.str1.1 .* AMS ' $t/log -! 
grep -q '.rodata.str1.1 .* AM ' $t/log || false +grep '.rodata.str1.1 .* AMS ' $t/log +not grep '.rodata.str1.1 .* AM ' $t/log diff --git a/test/arch-x86_64-emulation-deduction.sh b/test/arch-x86_64-emulation-deduction.sh index 3e903c76b4..300c1b8ab0 100755 --- a/test/arch-x86_64-emulation-deduction.sh +++ b/test/arch-x86_64-emulation-deduction.sh @@ -6,4 +6,4 @@ void _start() {} EOF ./mold -o $t/exe $t/a.o -readelf --file-header $t/exe | grep -qi x86-64 +readelf --file-header $t/exe | grep -i x86-64 diff --git a/test/arch-x86_64-execstack-if-needed.sh b/test/arch-x86_64-execstack-if-needed.sh index 3fae6c7605..5c1f3d1e08 100755 --- a/test/arch-x86_64-execstack-if-needed.sh +++ b/test/arch-x86_64-execstack-if-needed.sh @@ -9,7 +9,7 @@ main: EOF $CC -B. -o $t/exe $t/a.o >& /dev/null -readelf --segments -W $t/exe | grep -q 'GNU_STACK.* RW ' +readelf --segments -W $t/exe | grep 'GNU_STACK.* RW ' $CC -B. -o $t/exe $t/a.o -Wl,-z,execstack-if-needed -readelf --segments -W $t/exe | grep -q 'GNU_STACK.* RWE ' +readelf --segments -W $t/exe | grep 'GNU_STACK.* RWE ' diff --git a/test/arch-x86_64-function-multiversion.sh b/test/arch-x86_64-function-multiversion.sh new file mode 100755 index 0000000000..7bd011e202 --- /dev/null +++ b/test/arch-x86_64-function-multiversion.sh @@ -0,0 +1,29 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +supports_ifunc || skip +[ "$(uname)" = FreeBSD ] && skip + +cat < + +class Hello { +public: + __attribute__((target("default"))) void say() { std::cout << "Hello\n"; } + __attribute__((target("popcnt"))) void say() { std::cout << "Howdy\n"; } +}; + +void hello() { + Hello().say(); +} +EOF + +$CXX -B. 
-shared -o $t/b.so $t/a.o + +cat <& /dev/null || skip -$OBJDUMP -d $t/exe | grep -A1 '<__x86.get_pc_thunk.bx>:' | \ - grep -Fq 'puts$plt' +$OBJDUMP -d $t/exe | grep -A1 '<__x86.get_pc_thunk.bx>:' | + grep -F 'puts$plt' diff --git a/test/arch-x86_64-gnu-retain.sh b/test/arch-x86_64-gnu-retain.sh index 6c4dcc1514..d86a192591 100755 --- a/test/arch-x86_64-gnu-retain.sh +++ b/test/arch-x86_64-gnu-retain.sh @@ -22,8 +22,7 @@ foo: EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,-gc-sections -nm $t/exe1 | grep -q foo +nm $t/exe1 | grep foo $CC -B. -o $t/exe1 $t/a.o $t/c.o -Wl,-gc-sections -nm $t/exe1 > $t/log -! grep -q foo $t/log || false +nm $t/exe1 | not grep foo diff --git a/test/arch-x86_64-ifunc-alias.sh b/test/arch-x86_64-ifunc-alias.sh index fd7ebb3d23..87dff3e8b0 100755 --- a/test/arch-x86_64-ifunc-alias.sh +++ b/test/arch-x86_64-ifunc-alias.sh @@ -4,7 +4,7 @@ supports_ifunc || skip test_cflags -static || skip -cat < #include diff --git a/test/arch-x86_64-incompatible-libs-linker-script.sh b/test/arch-x86_64-incompatible-libs-linker-script.sh index b40d0cff1b..d3bf31f0ce 100755 --- a/test/arch-x86_64-incompatible-libs-linker-script.sh +++ b/test/arch-x86_64-incompatible-libs-linker-script.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null || skip +test_cflags -m32 || skip mkdir -p $t/foo @@ -23,7 +23,7 @@ EOF cd $t $CC -B$OLDPWD -o exe1 -Lfoo a.o b.script -LD_LIBRARY_PATH=. $QEMU ./exe1 | grep -q 'Hello world' +LD_LIBRARY_PATH=. $QEMU ./exe1 | grep 'Hello world' $CC -B$OLDPWD -o exe2 -Lfoo b.script a.o -LD_LIBRARY_PATH=. $QEMU ./exe2 | grep -q 'Hello world' +LD_LIBRARY_PATH=. 
$QEMU ./exe2 | grep 'Hello world' diff --git a/test/arch-x86_64-incompatible-libs-linker-script2.sh b/test/arch-x86_64-incompatible-libs-linker-script2.sh index 3630692c66..db91aba058 100755 --- a/test/arch-x86_64-incompatible-libs-linker-script2.sh +++ b/test/arch-x86_64-incompatible-libs-linker-script2.sh @@ -1,8 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc -nm mold | grep -q '__tsan_init' && skip -echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null || skip +nm mold | grep '__tsan_init' && skip +test_cflags -m32 || skip mkdir -p $t/foo diff --git a/test/arch-x86_64-incompatible-libs.sh b/test/arch-x86_64-incompatible-libs.sh index 3661d82bd7..a20874b1d1 100755 --- a/test/arch-x86_64-incompatible-libs.sh +++ b/test/arch-x86_64-incompatible-libs.sh @@ -1,8 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null \ - || skip +test_cflags -m32 || skip cat < $t/script/libfoo.so $CC -B. -o $t/exe -L$t/script -L$t/lib32 -L$t/lib64 \ $t/e.o -lfoo -Wl,-rpath $t/lib64 >& $t/log -grep -q 'script/libfoo.so: skipping incompatible file' $t/log -grep -q 'lib32/libfoo.so: skipping incompatible file' $t/log -grep -q 'lib32/libfoo.a: skipping incompatible file' $t/log -$QEMU $t/exe | grep -q 'Hello world' +grep 'script/libfoo.so: skipping incompatible file' $t/log +grep 'lib32/libfoo.so: skipping incompatible file' $t/log +grep 'lib32/libfoo.a: skipping incompatible file' $t/log +$QEMU $t/exe | grep 'Hello world' diff --git a/test/arch-x86_64-incompatible-libs2.sh b/test/arch-x86_64-incompatible-libs2.sh index 167045ea8e..fdf59c2398 100755 --- a/test/arch-x86_64-incompatible-libs2.sh +++ b/test/arch-x86_64-incompatible-libs2.sh @@ -1,8 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null \ - || skip +test_cflags -m32 || skip +nm mold | grep '__tsan_init' && skip cat < $t/script/libfoo.so -$CC -B. 
-o $t/exe -L$t/lib32 -L$t/lib64 -lfoo $t/e.o -Wl,-rpath $t/lib64 \ - >& $t/log +$CC -B. -o $t/exe -L$t/lib32 -L$t/lib64 -lfoo $t/e.o -Wl,-rpath $t/lib64 |& + grep 'lib32/libfoo.so: skipping incompatible file' -grep -q 'lib32/libfoo.so: skipping incompatible file' $t/log -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/arch-x86_64-incompatible-obj.sh b/test/arch-x86_64-incompatible-obj.sh index 9f073a9782..3d7f7b0fef 100755 --- a/test/arch-x86_64-incompatible-obj.sh +++ b/test/arch-x86_64-incompatible-obj.sh @@ -1,8 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int main() {}' | $CC -m32 -o $t/exe -xc - >& /dev/null \ - || skip +test_cflags -m32 || skip cat <& $t/log -grep -q "$t/b.o: incompatible file type: x86_64 is expected but got i386" $t/log +not $CC -B. -o /dev/null $t/a.o $t/b.o >& $t/log +grep "$t/b.o: incompatible file type: x86_64 is expected but got i386" $t/log diff --git a/test/arch-x86_64-init-array-readonly.sh b/test/arch-x86_64-init-array-readonly.sh index 2e6c41f2c0..8b6967e6a7 100755 --- a/test/arch-x86_64-init-array-readonly.sh +++ b/test/arch-x86_64-init-array-readonly.sh @@ -31,4 +31,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o -$QEMU $t/exe | grep -q 'init1 init2' +$QEMU $t/exe | grep 'init1 init2' diff --git a/test/arch-x86_64-init-array.sh b/test/arch-x86_64-init-array.sh index 38fb58c00e..20e7d9c463 100755 --- a/test/arch-x86_64-init-array.sh +++ b/test/arch-x86_64-init-array.sh @@ -35,4 +35,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q 'init1 init2 fini2 fini1' +$QEMU $t/exe | grep 'init1 init2 fini2 fini1' diff --git a/test/arch-x86_64-isa-level.sh b/test/arch-x86_64-isa-level.sh index d51afd8dea..2c2882012e 100755 --- a/test/arch-x86_64-isa-level.sh +++ b/test/arch-x86_64-isa-level.sh @@ -6,12 +6,12 @@ int main() {} EOF $CC -B. 
-o $t/exe2 $t/a.o -Wl,-z,x86-64-v2 -readelf -n $t/exe2 | grep -Fq 'Unknown note type: (0x00000005)' && skip -readelf -n $t/exe2 | grep -Fq 'procesor-specific type 0xc0008002' && skip -readelf -n $t/exe2 | grep -q 'x86 ISA needed: .*x86-64-v2' +readelf -n $t/exe2 | grep -F 'Unknown note type: (0x00000005)' && skip +readelf -n $t/exe2 | grep -F 'procesor-specific type 0xc0008002' && skip +readelf -n $t/exe2 | grep 'x86 ISA needed: .*x86-64-v2' $CC -B. -o $t/exe3 $t/a.o -Wl,-z,x86-64-v3 -readelf -n $t/exe3 | grep -q 'x86 ISA needed: .*x86-64-v3' +readelf -n $t/exe3 | grep 'x86 ISA needed: .*x86-64-v3' $CC -B. -o $t/exe4 $t/a.o -Wl,-z,x86-64-v4 -readelf -n $t/exe4 | grep -q 'x86 ISA needed: .*x86-64-v4' +readelf -n $t/exe4 | grep 'x86 ISA needed: .*x86-64-v4' diff --git a/test/arch-x86_64-large-bss.sh b/test/arch-x86_64-large-bss.sh index df24d3ad60..62fcd57014 100755 --- a/test/arch-x86_64-large-bss.sh +++ b/test/arch-x86_64-large-bss.sh @@ -15,4 +15,4 @@ char arr2[0xc0000000]; EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -Eq '^1 c0000000$' +$QEMU $t/exe | grep -E '^1 c0000000$' diff --git a/test/arch-x86_64-mergeable-records.sh b/test/arch-x86_64-mergeable-records.sh index 624089840a..d5ebe05305 100755 --- a/test/arch-x86_64-mergeable-records.sh +++ b/test/arch-x86_64-mergeable-records.sh @@ -46,8 +46,8 @@ EOF $CC -B. -static -o $t/exe $t/a.o -$QEMU $t/exe | grep -q '^abcdef$' -$QEMU $t/exe | grep -q '^bcdef$' -$QEMU $t/exe | grep -q '^ijkl$' -$QEMU $t/exe | grep -q '^pqr$' -$QEMU $t/exe | grep -q '^mnopqr$' +$QEMU $t/exe | grep '^abcdef$' +$QEMU $t/exe | grep '^bcdef$' +$QEMU $t/exe | grep '^ijkl$' +$QEMU $t/exe | grep '^pqr$' +$QEMU $t/exe | grep '^mnopqr$' diff --git a/test/arch-x86_64-mergeable-strings-nonalloc.sh b/test/arch-x86_64-mergeable-strings-nonalloc.sh index 3d817d36dd..b1cc3b6e49 100755 --- a/test/arch-x86_64-mergeable-strings-nonalloc.sh +++ b/test/arch-x86_64-mergeable-strings-nonalloc.sh @@ -19,5 +19,5 @@ EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o -readelf -x .foo $t/exe | grep -Fq '03000000 00000000 ffffffff ffffffff' -readelf -x .bar $t/exe | grep -Fq 'xyz.abc.' +readelf -x .foo $t/exe | grep -F '03000000 00000000 ffffffff ffffffff' +readelf -x .bar $t/exe | grep -F 'xyz.abc.' diff --git a/test/arch-x86_64-mergeable-strings.sh b/test/arch-x86_64-mergeable-strings.sh index 5d6148c2d0..0f9918902d 100755 --- a/test/arch-x86_64-mergeable-strings.sh +++ b/test/arch-x86_64-mergeable-strings.sh @@ -27,6 +27,6 @@ foo: EOF $CC -B. -static -o $t/exe $t/a.o -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' -readelf -sW $t/exe | grep -Eq '[0-9] foo$' +readelf -sW $t/exe | grep -E '[0-9] foo$' diff --git a/test/arch-x86_64-note-property.sh b/test/arch-x86_64-note-property.sh index 55286b1787..d456ef5e2e 100755 --- a/test/arch-x86_64-note-property.sh +++ b/test/arch-x86_64-note-property.sh @@ -2,8 +2,7 @@ . $(dirname $0)/common.inc echo endbr64 | $CC -o /dev/null -c -xassembler - 2> /dev/null || skip - -$CC -fcf-protection=branch -c /dev/null -o /dev/null -xc 2> /dev/null || skip +test_cflags -fcf-protection=branch || skip cat < $t/log +readelf -W --sections $t/exe > $t/log1 -grep -Eq '.note.bar\s+NOTE.+000008 00 A 0 0 4' $t/log -grep -Eq '.note.baz\s+NOTE.+000008 00 A 0 0 8' $t/log -grep -Eq '.note.nonalloc\s+NOTE.+000008 00 0 0 1' $t/log +grep -E '.note.bar\s+NOTE.+000008 00 A 0 0 4' $t/log1 +grep -E '.note.baz\s+NOTE.+000008 00 A 0 0 8' $t/log1 +grep -E '.note.nonalloc\s+NOTE.+000008 00 0 0 1' $t/log1 -readelf --segments $t/exe > $t/log -grep -Fq '01 .note.bar .note.baz .note.foo' $t/log -! 
grep -q 'NOTE.*0x0000000000000000 0x0000000000000000' $t/log || false +readelf --segments $t/exe > $t/log2 +grep -F '01 .note.baz .note.foo .note.bar' $t/log2 +not grep 'NOTE.*0x0000000000000000 0x0000000000000000' $t/log2 diff --git a/test/arch-x86_64-note2.sh b/test/arch-x86_64-note2.sh index e2bb303673..ed94108f4a 100755 --- a/test/arch-x86_64-note2.sh +++ b/test/arch-x86_64-note2.sh @@ -28,5 +28,4 @@ EOF ./mold -o $t/exe $t/a.o $t/b.o $t/c.o $t/d.o -readelf --segments $t/exe > $t/log -grep -Fq '01 .note.a .note.b .note.c' $t/log +readelf --segments $t/exe | grep -F '01 .note.a .note.c .note.b' diff --git a/test/arch-x86_64-plt.sh b/test/arch-x86_64-plt.sh index b8bddddf8f..7632f89cda 100755 --- a/test/arch-x86_64-plt.sh +++ b/test/arch-x86_64-plt.sh @@ -20,7 +20,7 @@ EOF $CC -B. -o $t/exe $t/a.o -readelf --sections $t/exe | grep -Fq '.got' -readelf --sections $t/exe | grep -Fq '.got.plt' +readelf --sections $t/exe | grep -F '.got' +readelf --sections $t/exe | grep -F '.got.plt' -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/arch-x86_64-preinit-array.sh b/test/arch-x86_64-preinit-array.sh index bb326e4004..c5f3ae53f7 100755 --- a/test/arch-x86_64-preinit-array.sh +++ b/test/arch-x86_64-preinit-array.sh @@ -30,4 +30,4 @@ int main() {} EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q 'preinit init fini' +$QEMU $t/exe | grep 'preinit init fini' diff --git a/test/arch-x86_64-relax.sh b/test/arch-x86_64-relax.sh index bf8b75e59e..2823139b43 100755 --- a/test/arch-x86_64-relax.sh +++ b/test/arch-x86_64-relax.sh @@ -38,20 +38,20 @@ EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o $OBJDUMP -d $t/exe | grep -A20 ':' > $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%rax .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%rcx .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%rdx .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%rbx .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%rbp .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%rsi .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%rdi .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r8 .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r9 .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r10 .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r11 .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r12 .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r13 .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r14 .*' $t/log -grep -Eq 'lea \s*0x.+\(%rip\),%r15 .*' $t/log -grep -Eq 'call.*' $t/log -grep -Eq 'jmp.*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%rax .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%rcx .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%rdx .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%rbx .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%rbp .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%rsi .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%rdi .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r8 .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r9 .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r10 .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r11 .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r12 .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r13 .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r14 .*' $t/log +grep -E 'lea \s*0x.+\(%rip\),%r15 .*' $t/log +grep -E 'call.*' $t/log +grep -E 'jmp.*' $t/log diff --git a/test/arch-x86_64-reloc-overflow.sh b/test/arch-x86_64-reloc-overflow.sh index 7d741b8352..6013cf32e3 100755 --- a/test/arch-x86_64-reloc-overflow.sh +++ b/test/arch-x86_64-reloc-overflow.sh @@ -8,5 +8,5 @@ foo: .short foo EOF -! 
./mold -e foo -o $t/exe $t/a.o 2> $t/log || false -grep -Fq 'relocation R_X86_64_16 against foo out of range' $t/log +not ./mold -e foo -o $t/exe $t/a.o |& + grep -F 'relocation R_X86_64_16 against foo out of range' diff --git a/test/arch-x86_64-reloc.sh b/test/arch-x86_64-reloc.sh index f00d9dbd1b..070346a53b 100755 --- a/test/arch-x86_64-reloc.sh +++ b/test/arch-x86_64-reloc.sh @@ -41,9 +41,9 @@ main: EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie -$QEMU $t/exe | grep -q '^42$' +$QEMU $t/exe | grep '^42$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie -$QEMU $t/exe | grep -q '^42$' +$QEMU $t/exe | grep '^42$' # GOT cat <<'EOF' > $t/d.s @@ -58,9 +58,9 @@ main: EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie -$QEMU $t/exe | grep -q '^56$' +$QEMU $t/exe | grep '^56$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie -$QEMU $t/exe | grep -q '^56$' +$QEMU $t/exe | grep '^56$' # Copyrel cat <<'EOF' > $t/d.s @@ -75,9 +75,9 @@ EOF $CC -c -o $t/d.o $t/d.s $CC -B. -o $t/exe $t/c.so $t/d.o -no-pie -$QEMU $t/exe | grep -q '^56$' +$QEMU $t/exe | grep '^56$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie -$QEMU $t/exe | grep -q '^56$' +$QEMU $t/exe | grep '^56$' # Copyrel cat <<'EOF' > $t/d.s @@ -96,9 +96,9 @@ foo: EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie -$QEMU $t/exe | grep -q '^56$' +$QEMU $t/exe | grep '^56$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie -$QEMU $t/exe | grep -q '^56$' +$QEMU $t/exe | grep '^56$' # PLT cat <<'EOF' > $t/d.s @@ -112,9 +112,9 @@ main: EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie -$QEMU $t/exe | grep -q '^76$' +$QEMU $t/exe | grep '^76$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie -$QEMU $t/exe | grep -q '^76$' +$QEMU $t/exe | grep '^76$' # PLT cat <<'EOF' > $t/d.s @@ -129,9 +129,9 @@ main: EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie -$QEMU $t/exe | grep -q '^76$' +$QEMU $t/exe | grep '^76$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie -$QEMU $t/exe | grep -q '^76$' +$QEMU $t/exe | grep '^76$' # SIZE32 cat <<'EOF' > $t/d.s @@ -151,7 +151,7 @@ foo: EOF $CC -B. 
-o $t/exe $t/c.so $t/d.s -$QEMU $t/exe | grep -q '^26$' +$QEMU $t/exe | grep '^26$' # SIZE64 cat <<'EOF' > $t/d.s @@ -171,7 +171,7 @@ foo: EOF $CC -B. -o $t/exe $t/c.so $t/d.s -$QEMU $t/exe | grep -q '^61$' +$QEMU $t/exe | grep '^61$' # GOTPCREL64 cat <<'EOF' > $t/e.c @@ -186,7 +186,7 @@ EOF $CC -c -o $t/e.o $t/e.c -mcmodel=large -fPIC $CC -B. -o $t/exe $t/c.so $t/e.o -$QEMU $t/exe | grep -q '^56000003$' +$QEMU $t/exe | grep '^56000003$' # R_X86_64_32 against non-alloc section cat <<'EOF' > $t/f.s @@ -211,5 +211,5 @@ $CC -c -o $t/f.o $t/f.s $CC -B. -o $t/exe $t/f.o readelf -x .foo -x .bar $t/exe > $t/log -grep -Fq '0x00000010 00000000 00000000 10000000 00000000' $t/log -grep -Fq '0x00000010 18000000 00000000' $t/log +grep -F '0x00000010 00000000 00000000 10000000 00000000' $t/log +grep -F '0x00000010 18000000 00000000' $t/log diff --git a/test/arch-x86_64-section-alignment.sh b/test/arch-x86_64-section-alignment.sh index e4a87f0902..7f174d10c5 100755 --- a/test/arch-x86_64-section-alignment.sh +++ b/test/arch-x86_64-section-alignment.sh @@ -33,4 +33,4 @@ int main() { EOF $CC -B. 
-o $t/exe $t/a.o -$QEMU $t/exe | grep -q '^0 0 0$' +$QEMU $t/exe | grep '^0 0 0$' diff --git a/test/arch-x86_64-section-name.sh b/test/arch-x86_64-section-name.sh index 8fa06e9019..fb533a637f 100755 --- a/test/arch-x86_64-section-name.sh +++ b/test/arch-x86_64-section-name.sh @@ -55,18 +55,18 @@ EOF ./mold -o $t/exe $t/a.o -z keep-text-section-prefix -readelf -p .text.hot $t/exe | grep -Fq '.text.hot .text.hot.foo' -readelf -p .text.unknown $t/exe | grep -Fq '.text.unknown .text.unknown.foo' -readelf -p .text.unlikely $t/exe | grep -Fq '.text.unlikely .text.unlikely.foo' -readelf -p .text.startup $t/exe | grep -Fq '.text.startup .text.startup.foo' -readelf -p .text.exit $t/exe | grep -Fq '.text.exit .text.exit.foo' -readelf -p .text $t/exe | grep -Fq '.text .text.foo' -readelf -p .data.rel.ro $t/exe | grep -Fq '.data.rel.ro .data.rel.ro.foo' -readelf -p .data $t/exe | grep -Fq '.data .data.foo' -readelf -p .rodata $t/exe | grep -Fq '.rodata .rodata.foo' +readelf -p .text.hot $t/exe | grep -F '.text.hot .text.hot.foo' +readelf -p .text.unknown $t/exe | grep -F '.text.unknown .text.unknown.foo' +readelf -p .text.unlikely $t/exe | grep -F '.text.unlikely .text.unlikely.foo' +readelf -p .text.startup $t/exe | grep -F '.text.startup .text.startup.foo' +readelf -p .text.exit $t/exe | grep -F '.text.exit .text.exit.foo' +readelf -p .text $t/exe | grep -F '.text .text.foo' +readelf -p .data.rel.ro $t/exe | grep -F '.data.rel.ro .data.rel.ro.foo' +readelf -p .data $t/exe | grep -F '.data .data.foo' +readelf -p .rodata $t/exe | grep -F '.rodata .rodata.foo' ./mold -o $t/exe $t/a.o -! readelf --sections $t/exe | grep -Fq .text.hot || false +readelf --sections $t/exe | not grep -F .text.hot ./mold -o $t/exe $t/a.o -z nokeep-text-section-prefix -! 
readelf --sections $t/exe | grep -Fq .text.hot || false +readelf --sections $t/exe | not grep -F .text.hot diff --git a/test/arch-x86_64-tbss-only.sh b/test/arch-x86_64-tbss-only.sh index 6ebdb45313..9e510d2f1d 100755 --- a/test/arch-x86_64-tbss-only.sh +++ b/test/arch-x86_64-tbss-only.sh @@ -2,7 +2,7 @@ . $(dirname $0)/common.inc # Test if grep supports backreferences -echo abab | grep -Eq '(ab)\1' || skip +echo abab | grep -E '(ab)\1' || skip cat <&1 | grep -Eq 'may cause a segmentation fault|requires executable stack' +$GCC -B. -o $t/exe $t/a.o $t/b.o |& grep -E 'may cause a segmentation fault|requires executable stack' diff --git a/test/arch-x86_64-warn-shared-textrel.sh b/test/arch-x86_64-warn-shared-textrel.sh index 3c5a6da956..6a30303908 100755 --- a/test/arch-x86_64-warn-shared-textrel.sh +++ b/test/arch-x86_64-warn-shared-textrel.sh @@ -17,5 +17,5 @@ int main() { fn(); } EOF $CC -B. -shared -o $t/c.so $t/a.o $t/b.o -Wl,-warn-shared-textrel >& $t/log -grep -q 'relocation against symbol `main'\'' in read-only section' $t/log -grep -q 'creating a DT_TEXTREL in an output file' $t/log +grep 'relocation against symbol `main'\'' in read-only section' $t/log +grep 'creating a DT_TEXTREL in an output file' $t/log diff --git a/test/arch-x86_64-warn-textrel.sh b/test/arch-x86_64-warn-textrel.sh index 031cdcdf29..906f1e9bd4 100755 --- a/test/arch-x86_64-warn-textrel.sh +++ b/test/arch-x86_64-warn-textrel.sh @@ -17,5 +17,5 @@ int main() { fn(); } EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o -pie -Wl,-warn-textrel >& $t/log -grep -q 'relocation against symbol `main'\'' in read-only section' $t/log -grep -q 'creating a DT_TEXTREL in an output file' $t/log +grep 'relocation against symbol `main'\'' in read-only section' $t/log +grep 'creating a DT_TEXTREL in an output file' $t/log diff --git a/test/arch-x86_64-z-dynamic-undefined-weak.sh b/test/arch-x86_64-z-dynamic-undefined-weak.sh new file mode 100755 index 0000000000..4890f92135 --- /dev/null +++ b/test/arch-x86_64-z-dynamic-undefined-weak.sh @@ -0,0 +1,13 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +[ "$(uname)" = FreeBSD ] && skip + +cat < +__attribute__((weak)) extern int foo; +int main() { printf("%p\n", &foo); } +EOF + +not $CC -B. -o $t/exe3 $t/b.o -no-pie -Wl,-z,dynamic-undefined-weak |& + grep 'cannot create a copy relocation for foo' diff --git a/test/arch-x86_64-z-ibt.sh b/test/arch-x86_64-z-ibt.sh index 9bb5f066ed..79c8c0a4e0 100755 --- a/test/arch-x86_64-z-ibt.sh +++ b/test/arch-x86_64-z-ibt.sh @@ -10,8 +10,7 @@ main: EOF $CC -B. -o $t/exe $t/a.o -readelf --notes $t/exe > $t/log -! grep -qw SHSTK $t/log +readelf --notes $t/exe | not grep -w SHSTK $CC -B. -o $t/exe $t/a.o -Wl,-z,ibt -readelf --notes $t/exe | grep -qw IBT +readelf --notes $t/exe | grep -w IBT diff --git a/test/arch-x86_64-z-ibtplt.sh b/test/arch-x86_64-z-ibtplt.sh index 3f8be5760e..789f12f1bd 100755 --- a/test/arch-x86_64-z-ibtplt.sh +++ b/test/arch-x86_64-z-ibtplt.sh @@ -24,4 +24,4 @@ int main() { EOF $CC -B. -o $t/exe $t/c.o $t/b.so -Wl,-z,ibtplt -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/arch-x86_64-z-rewrite-endbr.sh b/test/arch-x86_64-z-rewrite-endbr.sh index 0a04ffdcb9..59ae1cd68a 100755 --- a/test/arch-x86_64-z-rewrite-endbr.sh +++ b/test/arch-x86_64-z-rewrite-endbr.sh @@ -15,13 +15,13 @@ EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $OBJDUMP -dr $t/exe1 > $t/log1 -grep -A1 ':' $t/log1 | grep -q endbr64 -grep -A1 ':' $t/log1 | grep -q endbr64 -grep -A1 '
:' $t/log1 | grep -q endbr64 +grep -A1 ':' $t/log1 | grep endbr64 +grep -A1 ':' $t/log1 | grep endbr64 +grep -A1 '
:' $t/log1 | grep endbr64 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-z,rewrite-endbr $OBJDUMP -dr $t/exe2 > $t/log2 -grep -A1 ':' $t/log2 | grep -q nop -grep -A1 ':' $t/log2 | grep -q nop -grep -A1 '
:' $t/log2 | grep -q endbr64 +grep -A1 ':' $t/log2 | grep nop +grep -A1 ':' $t/log2 | grep nop +grep -A1 '
:' $t/log2 | grep endbr64 diff --git a/test/arch-x86_64-z-rewrite-endbr2.sh b/test/arch-x86_64-z-rewrite-endbr2.sh index 52d1445b72..984f701184 100755 --- a/test/arch-x86_64-z-rewrite-endbr2.sh +++ b/test/arch-x86_64-z-rewrite-endbr2.sh @@ -15,13 +15,13 @@ EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $OBJDUMP -dr $t/exe1 > $t/log1 -grep -A1 ':' $t/log1 | grep -q endbr64 -grep -A1 ':' $t/log1 | grep -q endbr64 -grep -A1 '
:' $t/log1 | grep -q endbr64 +grep -A1 ':' $t/log1 | grep endbr64 +grep -A1 ':' $t/log1 | grep endbr64 +grep -A1 '
:' $t/log1 | grep endbr64 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-z,rewrite-endbr $OBJDUMP -dr $t/exe2 > $t/log2 -grep -A1 ':' $t/log2 | grep -q nop -grep -A1 ':' $t/log2 | grep -q nop -grep -A1 '
:' $t/log2 | grep -q endbr64 +grep -A1 ':' $t/log2 | grep nop +grep -A1 ':' $t/log2 | grep nop +grep -A1 '
:' $t/log2 | grep endbr64 diff --git a/test/arch-x86_64-z-rewrite-endbr3.sh b/test/arch-x86_64-z-rewrite-endbr3.sh index f835854294..c9ab589c08 100755 --- a/test/arch-x86_64-z-rewrite-endbr3.sh +++ b/test/arch-x86_64-z-rewrite-endbr3.sh @@ -5,8 +5,7 @@ test_cflags -fcf-protection || skip [ "$QEMU" == '' ] || skip # Check if Intel SDE CPU emulator is available -command -v sde >& /dev/null || skip -sde --help | grep -q 'Software Development Emulator' || skip +command -v sde64 >& /dev/null || skip cat < @@ -16,4 +15,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,rewrite-endbr -sde -cet 1 -- $t/exe | grep -q 'Hello world' +sde64 -cet 1 -- $t/exe | grep 'Hello world' diff --git a/test/arch-x86_64-z-shstk.sh b/test/arch-x86_64-z-shstk.sh index 42c51439f3..d25106e678 100755 --- a/test/arch-x86_64-z-shstk.sh +++ b/test/arch-x86_64-z-shstk.sh @@ -9,8 +9,7 @@ main: EOF $CC -B. -o $t/exe $t/a.o -readelf --notes $t/exe > $t/log -! grep -qw SHSTK $t/log +readelf --notes $t/exe | not grep -w SHSTK $CC -B. -o $t/exe $t/a.o -Wl,-z,shstk -readelf --notes $t/exe | grep -qw SHSTK +readelf --notes $t/exe | grep -w SHSTK diff --git a/test/arch-x86_64-z-text.sh b/test/arch-x86_64-z-text.sh index 596576eb34..ab6e020be8 100755 --- a/test/arch-x86_64-z-text.sh +++ b/test/arch-x86_64-z-text.sh @@ -31,7 +31,7 @@ int main() { EOF $CC -B. -pie -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q 3 +$QEMU $t/exe | grep 3 -readelf --dynamic $t/exe | grep -Fq '(TEXTREL)' -readelf --dynamic $t/exe | grep -q '\(FLAGS\).*TEXTREL' +readelf --dynamic $t/exe | grep -F '(TEXTREL)' +readelf --dynamic $t/exe | grep '\(FLAGS\).*TEXTREL' diff --git a/test/as-needed-dso.sh b/test/as-needed-dso.sh index 60fd6bd691..e86cdd4f2a 100755 --- a/test/as-needed-dso.sh +++ b/test/as-needed-dso.sh @@ -17,5 +17,5 @@ EOF $CC -B. 
-o $t/exe $t/a.o -L$t -Wl,--as-needed -lbar -lfoo readelf -W --dynamic $t/exe > $t/log2 -grep -q libbar $t/log2 -grep -q libfoo $t/log2 +grep libbar $t/log2 +not grep libfoo $t/log2 diff --git a/test/as-needed-dso2.sh b/test/as-needed-dso2.sh index f283997139..19026d355a 100755 --- a/test/as-needed-dso2.sh +++ b/test/as-needed-dso2.sh @@ -29,5 +29,5 @@ $CC -B. -shared -o $t/libbar.so $t/b.o $CC -B. -o $t/exe $t/c.o -L$t -Wl,--as-needed -lfoo -lbar readelf --dynamic $t/exe > $t/log -! grep libfoo.so $t/log || false -grep -q libbar.so $t/log +not grep libfoo.so $t/log +grep libbar.so $t/log diff --git a/test/as-needed-weak.sh b/test/as-needed-weak.sh index fc43230089..6eb66eae28 100755 --- a/test/as-needed-weak.sh +++ b/test/as-needed-weak.sh @@ -21,11 +21,11 @@ EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-no-as-needed -L$t -lbar -lfoo readelf --dynamic $t/exe1 > $t/log1 -grep -Fq 'Shared library: [libfoo.so]' $t/log1 -grep -Fq 'Shared library: [libbar.so]' $t/log1 +grep -F 'Shared library: [libfoo.so]' $t/log1 +grep -F 'Shared library: [libbar.so]' $t/log1 $CC -B. -o $t/exe2 $t/a.o -Wl,-as-needed -L$t -lbar -lfoo readelf --dynamic $t/exe2 > $t/log2 -grep -Fq 'Shared library: [libfoo.so]' $t/log2 -! grep -Fq 'Shared library: [libbar.so]' $t/log2 || false +grep -F 'Shared library: [libfoo.so]' $t/log2 +not grep -F 'Shared library: [libbar.so]' $t/log2 diff --git a/test/as-needed.sh b/test/as-needed.sh index 6d5448c8a5..d89a570023 100755 --- a/test/as-needed.sh +++ b/test/as-needed.sh @@ -19,11 +19,11 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,--no-as-needed $t/b.so $t/c.so readelf --dynamic $t/exe > $t/log -grep -Fq 'Shared library: [libfoo.so]' $t/log -grep -Fq 'Shared library: [libbar.so]' $t/log +grep -F 'Shared library: [libfoo.so]' $t/log +grep -F 'Shared library: [libbar.so]' $t/log $CC -B. -o $t/exe $t/a.o -Wl,--as-needed $t/b.so $t/c.so readelf --dynamic $t/exe > $t/log -grep -Fq 'Shared library: [libfoo.so]' $t/log -! 
grep -Fq 'Shared library: [libbar.so]' $t/log || false +grep -F 'Shared library: [libfoo.so]' $t/log +not grep -F 'Shared library: [libbar.so]' $t/log diff --git a/test/auxiliary.sh b/test/auxiliary.sh index 5fa66bf672..343b694968 100755 --- a/test/auxiliary.sh +++ b/test/auxiliary.sh @@ -11,5 +11,5 @@ EOF ./mold -o $t/b.so $t/a.o -auxiliary foo -f bar -shared readelf --dynamic $t/b.so > $t/log -grep -Fq 'Auxiliary library: [foo]' $t/log -grep -Fq 'Auxiliary library: [bar]' $t/log +grep -F 'Auxiliary library: [foo]' $t/log +grep -F 'Auxiliary library: [bar]' $t/log diff --git a/test/bno-symbolic.sh b/test/bno-symbolic.sh index 213a1cc825..1e1fddbfc0 100755 --- a/test/bno-symbolic.sh +++ b/test/bno-symbolic.sh @@ -3,7 +3,7 @@ # GCC produces buggy code for this test case on s390x. # https://sourceware.org/bugzilla/show_bug.cgi?id=29655 -[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-4]\.' && skip +[ $MACHINE = s390x ] && $CC -v |& grep -E '^gcc version 1[0-5]\.' && skip cat < $t/a.c +echo 'int main() { return 0; }' | $CC -c -o $t/a.o -xc - -$CC -B. -o $t/exe $t/a.c -Wl,-build-id -readelf -n $t/exe | grep -qv 'GNU.*0x00000010.*NT_GNU_BUILD_ID' +$CC -B. -o $t/exe1 $t/a.o -Wl,-build-id +readelf -n $t/exe1 | grep 'GNU.*0x00000014.*NT_GNU_BUILD_ID' -$CC -B. -o $t/exe $t/a.c -Wl,-build-id=uuid -readelf -nW $t/exe | grep -Eq 'Build ID: ............4...[89abcdef]' +$CC -B. -o $t/exe2 $t/a.o -Wl,-build-id=uuid +readelf -nW $t/exe2 | grep -E 'Build ID: ............4...[89abcdef]' -$CC -B. -o $t/exe $t/a.c -Wl,-build-id=md5 -readelf -n $t/exe | grep -q 'GNU.*0x00000010.*NT_GNU_BUILD_ID' +$CC -B. -o $t/exe3 $t/a.o -Wl,-build-id=md5 +readelf -n $t/exe3 | grep 'GNU.*0x00000010.*NT_GNU_BUILD_ID' -$CC -B. -o $t/exe $t/a.c -Wl,-build-id=sha1 -readelf -n $t/exe | grep -q 'GNU.*0x00000014.*NT_GNU_BUILD_ID' +$CC -B. -o $t/exe4 $t/a.o -Wl,-build-id=sha1 +readelf -n $t/exe4 | grep 'GNU.*0x00000014.*NT_GNU_BUILD_ID' -$CC -B. 
-o $t/exe $t/a.c -Wl,-build-id=sha256 -readelf -n $t/exe | grep -q 'GNU.*0x00000020.*NT_GNU_BUILD_ID' +$CC -B. -o $t/exe5 $t/a.o -Wl,-build-id=sha256 +readelf -n $t/exe5 | grep 'GNU.*0x00000020.*NT_GNU_BUILD_ID' -$CC -B. -o $t/exe $t/a.c -Wl,-build-id=fast -readelf -n $t/exe | grep -q 'GNU.*0x00000020.*NT_GNU_BUILD_ID' +$CC -B. -o $t/exe6 $t/a.o -Wl,-build-id=fast +readelf -n $t/exe6 | grep 'GNU.*0x00000020.*NT_GNU_BUILD_ID' -$CC -B. -o $t/exe $t/a.c -Wl,-build-id=0xdeadbeefdeadbeef -readelf -n $t/exe | grep -q 'Build ID: deadbeefdeadbeef' +$CC -B. -o $t/exe7 $t/a.o -Wl,-build-id=0xdeadbeefdeadbeef +readelf -n $t/exe7 | grep 'Build ID: deadbeefdeadbeef' diff --git a/test/canonical-plt.sh b/test/canonical-plt.sh index 53188e0e2d..4fe10dea58 100755 --- a/test/canonical-plt.sh +++ b/test/canonical-plt.sh @@ -3,7 +3,7 @@ # GCC produces buggy code for this test case on s390x. # https://sourceware.org/bugzilla/show_bug.cgi?id=29655 -[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-4]\.' && skip +[ $MACHINE = s390x ] && $CC -v |& grep -E '^gcc version 1[0-5]\.' && skip cat <&1 | grep -q 'unknown command line option: -zfoo' -{ ./mold -z foo || true; } 2>&1 | grep -q 'unknown command line option: -z foo' -{ ./mold -abcdefg || true; } 2>&1 | grep -q 'unknown command line option: -abcdefg' -{ ./mold --abcdefg || true; } 2>&1 | grep -q 'unknown command line option: --abcdefg' +not ./mold -zfoo |& grep 'unknown command line option: -zfoo' +not ./mold -z foo |& grep 'unknown command line option: -z foo' +not ./mold -abcdefg |& grep 'unknown command line option: -abcdefg' +not ./mold --abcdefg |& grep 'unknown command line option: --abcdefg' diff --git a/test/color-diagnostics.sh b/test/color-diagnostics.sh index c4cb2e8f58..f2eece0554 100755 --- a/test/color-diagnostics.sh +++ b/test/color-diagnostics.sh @@ -6,14 +6,7 @@ int foo(); int main() { foo(); } EOF -! ./mold -o $t/exe $t/a.o --color-diagnostics 2> $t/log -! grep -q $'\033' $t/log || false - -! 
./mold -o $t/exe $t/a.o --color-diagnostics=always 2> $t/log -grep -q $'\033' $t/log - -! ./mold -o $t/exe $t/a.o --color-diagnostics=never 2> $t/log -! grep -q $'\033' $t/log || false - -! ./mold -o $t/exe $t/a.o --color-diagnostics=auto 2> $t/log -! grep -q $'\033' $t/log || false +not ./mold -o $t/exe $t/a.o --color-diagnostics |& not grep $'\033' +not ./mold -o $t/exe $t/a.o --color-diagnostics=always |& grep $'\033' +not ./mold -o $t/exe $t/a.o --color-diagnostics=never |& not grep $'\033' +not ./mold -o $t/exe $t/a.o --color-diagnostics=auto |& not grep $'\033' diff --git a/test/comment.sh b/test/comment.sh index 5493f94607..47b5e0d9d8 100755 --- a/test/comment.sh +++ b/test/comment.sh @@ -6,5 +6,5 @@ int main() {} EOF $CC -B. -o $t/exe $t/a.o -readelf -p .comment $t/exe | grep -q mold -readelf -SW $t/exe | grep -Eq '\.comment.*\bMS\b' +readelf -p .comment $t/exe | grep mold +readelf -SW $t/exe | grep -E '\.comment.*\bMS\b' diff --git a/test/common-archive.sh b/test/common-archive.sh index 3e6d41f0cf..d9340ecc01 100755 --- a/test/common-archive.sh +++ b/test/common-archive.sh @@ -31,7 +31,7 @@ rm -f $t/e.a ar rcs $t/e.a $t/b.o $t/c.o $t/d.o $CC -B. 
-o $t/exe $t/a.o $t/e.a -$QEMU $t/exe | grep -q '5 0 0 -1' +$QEMU $t/exe | grep '5 0 0 -1' cat < $t/log -grep -q '.common .*NOBITS' $t/log +grep '.common .*NOBITS' $t/log diff --git a/test/common.inc b/test/common.inc index fdad9f2719..95fed2424c 100644 --- a/test/common.inc +++ b/test/common.inc @@ -52,17 +52,21 @@ else OBJCOPY="$TRIPLE-objcopy" STRIP="$TRIPLE-strip" - if [ $MACHINE = i686 ]; then - QEMU="qemu-i386 -L /usr/$TRIPLE" - else - QEMU="qemu-$MACHINE -L /usr/$TRIPLE" - fi + case $MACHINE in + i686) QEMU="qemu-i386 -L /usr/$TRIPLE" ;; + sh4aeb) QEMU="qemu-sh4eb -L /usr/$TRIPLE" ;; + *) QEMU="qemu-$MACHINE -L /usr/$TRIPLE" ;; + esac fi +testname=$(basename "$0" .sh) +t=$TESTDIR/$testname +mkdir -p $t + case $MACHINE in x86_64 | i686 | arm) tlsdesc_opt=-mtls-dialect=gnu2 ;; -aarch64 | loongarch*) +aarch64 | riscv* | loongarch*) tlsdesc_opt=-mtls-dialect=desc ;; esac @@ -73,6 +77,10 @@ if [ "$(uname)" = FreeBSD ]; then fi # Common functions +not() { + if "$@"; then return 1; else return 0; fi +} + test_cflags() { echo 'int main() {}' | $CC -B. "$@" -o /dev/null -xc - >& /dev/null } @@ -82,12 +90,12 @@ test_cxxflags() { } is_musl() { - ldd --version 2>&1 | grep -q musl + { ldd --version; true; } |& grep musl > /dev/null } supports_ifunc() { - ! is_musl && \ - echo 'void x() __attribute__((ifunc("y"))); void *y() { return 0; }' | \ + ! is_musl && + echo 'void x() __attribute__((ifunc("y"))); void *y() { return 0; }' | $CC -c -o /dev/null -xc - >& /dev/null } @@ -98,11 +106,20 @@ supports_tlsdesc() { # FreeBSD's loader doesn't seem to support TLSDESC relocs in an executable [ "$(uname)" = FreeBSD ] && return 1 - [ "$tlsdesc_opt" != '' ] + [ "$tlsdesc_opt" = '' ] && return 1 + + # TLSDESC may not be supported on old systems. Compile a DSO with + # it to see if it is actually supported. 
+ echo '_Thread_local int x; int y() { return x; }' | + $CC -shared -fPIC -xc -o $t/tlsdesc.so $tlsdesc_opt - 2> /dev/null || + return 1 + echo 'int y(); int main() { y(); }' | $CC -xc -c -o $t/tlsdesc.o - + $CC -o $t/tlsdesc $t/tlsdesc.o $t/tlsdesc.so + $QEMU $t/tlsdesc 2> /dev/null } on_qemu() { - [ "$QEMU" != '' ] || grep -qw qemu /proc/cpuinfo 2> /dev/null + [ "$QEMU" != '' ] || grep -w qemu /proc/cpuinfo >& /dev/null } skip() { @@ -127,8 +144,6 @@ trap 'on_error $LINENO' ERR trap on_exit EXIT # Print out the startup message -testname=$(basename "$0" .sh) echo -n "Testing $testname ... " -t=$TESTDIR/$testname -mkdir -p $t +set -o pipefail set -x diff --git a/test/compress-debug-sections.sh b/test/compress-debug-sections.sh index 03522a0b6f..70220905f4 100755 --- a/test/compress-debug-sections.sh +++ b/test/compress-debug-sections.sh @@ -9,5 +9,5 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,--compress-debug-sections=zlib readelf -WS $t/exe > $t/log -grep -q '\.debug_info .* [Cx] ' $t/log -grep -q '\.debug_str .* MS[Cx] ' $t/log +grep '\.debug_info .* [Cx] ' $t/log +grep '\.debug_str .* MS[Cx] ' $t/log diff --git a/test/compressed-debug-info.sh b/test/compressed-debug-info.sh index e287b6efe9..c5b5d8413c 100755 --- a/test/compressed-debug-info.sh +++ b/test/compressed-debug-info.sh @@ -17,4 +17,4 @@ EOF $CC -B. -o $t/exe $t/a.o $t/b.o dwarfdump $t/exe > /dev/null -readelf --sections $t/exe | grep -Fq .debug_info +readelf --sections $t/exe | grep -F .debug_info diff --git a/test/copyrel-alignment.sh b/test/copyrel-alignment.sh index 432179bd8b..03ab2ce631 100755 --- a/test/copyrel-alignment.sh +++ b/test/copyrel-alignment.sh @@ -25,12 +25,12 @@ EOF $CC -B. -o $t/exe1 $t/d.o $t/a.so -no-pie $QEMU $t/exe1 > /dev/null -readelf -W --sections $t/exe1 | grep -q '\.copyrel.* 32$' +readelf -W --sections $t/exe1 | grep '\.copyrel.* 32$' $CC -B. 
-o $t/exe2 $t/d.o $t/b.so -no-pie $QEMU $t/exe2 > /dev/null -readelf -W --sections $t/exe2 | grep -q '\.copyrel.* 8$' +readelf -W --sections $t/exe2 | grep '\.copyrel.* 8$' $CC -B. -o $t/exe3 $t/d.o $t/c.so -no-pie $QEMU $t/exe3 > /dev/null -readelf -W --sections $t/exe3 | grep -q '\.copyrel.* 256$' +readelf -W --sections $t/exe3 | grep '\.copyrel.* 256$' diff --git a/test/copyrel-norelro.sh b/test/copyrel-norelro.sh index 36b712e00c..dbfed07819 100755 --- a/test/copyrel-norelro.sh +++ b/test/copyrel-norelro.sh @@ -19,9 +19,7 @@ __attribute__((section (".data.rel.ro"))) char msg[100] = "Hello world"; EOF $CC -B. $t/a.o $t/b.so -o $t/exe1 -no-pie -Wl,-z,relro -readelf -W --sections $t/exe1 > $t/log1 -grep -Fq .copyrel.rel.ro $t/log1 +readelf -W --sections $t/exe1 | grep -F .copyrel.rel.ro $CC -B. $t/a.o $t/b.so -o $t/exe2 -no-pie -Wl,-z,norelro -readelf -W --sections $t/exe2 > $t/log2 -! grep -Fq .copyrel.rel.ro $t/log2 || false +readelf -W --sections $t/exe2 | not grep -F .copyrel.rel.ro diff --git a/test/copyrel-protected.sh b/test/copyrel-protected.sh index 0cd196c411..000a8e74d4 100755 --- a/test/copyrel-protected.sh +++ b/test/copyrel-protected.sh @@ -17,5 +17,5 @@ cat <& $t/log -no-pie || false -grep -Fq 'cannot create a copy relocation for protected symbol' $t/log +not $CC -B. $t/a.o $t/b.so -o $t/exe -no-pie |& + grep -F 'cannot create a copy relocation for protected symbol' diff --git a/test/copyrel-relro.sh b/test/copyrel-relro.sh index 3b16124001..3822bfd391 100755 --- a/test/copyrel-relro.sh +++ b/test/copyrel-relro.sh @@ -37,4 +37,4 @@ char readwrite[100] = "abc"; EOF $CC -B. $t/a.o $t/b.so -o $t/exe -no-pie -$QEMU $t/exe | grep -q '^sigsegv 0 1$' +$QEMU $t/exe | grep '^sigsegv 0 1$' diff --git a/test/copyrel-relro2.sh b/test/copyrel-relro2.sh index 0470ad9de9..90a9eec955 100755 --- a/test/copyrel-relro2.sh +++ b/test/copyrel-relro2.sh @@ -37,4 +37,4 @@ char readwrite[100] = "abc"; EOF $CC -B. 
$t/a.o $t/b.so -o $t/exe -no-pie -$QEMU $t/exe | grep -q '^sigsegv 0 1$' +$QEMU $t/exe | grep '^sigsegv 0 1$' diff --git a/test/copyrel.sh b/test/copyrel.sh index 365b051e72..c4fec90546 100755 --- a/test/copyrel.sh +++ b/test/copyrel.sh @@ -26,4 +26,4 @@ EOF $CC -B. -shared -o $t/c.so $t/c.o $CC -B. -no-pie -o $t/exe $t/a.o $t/b.o $t/c.so -$QEMU $t/exe | grep -q '42 42 1' +$QEMU $t/exe | grep '42 42 1' diff --git a/test/ctors-in-init-array.sh b/test/ctors-in-init-array.sh index ab048df616..497423bc90 100755 --- a/test/ctors-in-init-array.sh +++ b/test/ctors-in-init-array.sh @@ -52,4 +52,4 @@ int main() {} EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q 'ctor1 init1 ctor2 ctor3 ctor4 init2 fini2 dtor1 dtor2 dtor3 fini1 dtor4' +$QEMU $t/exe | grep 'ctor1 init1 ctor2 ctor3 ctor4 init2 fini2 dtor1 dtor2 dtor3 fini1 dtor4' diff --git a/test/dead-debug-sections.sh b/test/dead-debug-sections.sh index 28c88f7aa3..410bef1231 100755 --- a/test/dead-debug-sections.sh +++ b/test/dead-debug-sections.sh @@ -26,6 +26,6 @@ int main() { printf("%s\n", msg); } EOF $CXX -o $t/exe $t/a.o $t/b.o $t/c.o -g -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' dwarfdump $t/exe > /dev/null diff --git a/test/debug-macro-section.sh b/test/debug-macro-section.sh index 8ce796e987..21c36a50a8 100755 --- a/test/debug-macro-section.sh +++ b/test/debug-macro-section.sh @@ -18,5 +18,4 @@ int z() { return A + B; } EOF $GCC -B. -o $t/exe $t/b.o $t/c.o -$OBJDUMP --dwarf=macro $t/exe > $t/log -! grep 'DW_MACRO_import -.* 0x0$' $t/log || false +$OBJDUMP --dwarf=macro $t/exe | not grep 'DW_MACRO_import -.* 0x0$' diff --git a/test/default-symver.sh b/test/default-symver.sh index a7f5dd17b3..1acb32b5a3 100755 --- a/test/default-symver.sh +++ b/test/default-symver.sh @@ -6,8 +6,8 @@ void foo() {} EOF $CC -B. -o $t/b.so -shared $t/a.o -Wl,-default-symver -readelf --dyn-syms $t/b.so | grep -q ' foo@@b\.so' +readelf --dyn-syms $t/b.so | grep ' foo@@b\.so' $CC -B. 
-o $t/b.so -shared $t/a.o \ -Wl,--soname=bar -Wl,-default-symver -readelf --dyn-syms $t/b.so | grep -q ' foo@@bar' +readelf --dyn-syms $t/b.so | grep ' foo@@bar' diff --git a/test/defsym-lto.sh b/test/defsym-lto.sh index d60b83df6b..24fe659049 100755 --- a/test/defsym-lto.sh +++ b/test/defsym-lto.sh @@ -18,4 +18,4 @@ EOF $CC -B. -flto -o $t/exe $t/a.o -Wl,-defsym,live_func=dead_func -$QEMU $t/exe | grep -q "^OK$" +$QEMU $t/exe | grep "^OK$" diff --git a/test/defsym-missing-symbol.sh b/test/defsym-missing-symbol.sh index a2aa54378e..326cc2e508 100755 --- a/test/defsym-missing-symbol.sh +++ b/test/defsym-missing-symbol.sh @@ -5,5 +5,5 @@ cat < $t/log -grep -q 'undefined symbol: bar' $t/log +not $CC -B. -o $t/exe $t/a.o -Wl,-defsym=foo=bar |& + grep 'undefined symbol: bar' diff --git a/test/defsym.sh b/test/defsym.sh index 7621bdbacb..42b6867864 100755 --- a/test/defsym.sh +++ b/test/defsym.sh @@ -19,4 +19,4 @@ EOF $CC -B. -o $t/exe $t/a.o -pie -Wl,-defsym=foo=16 \ -Wl,-defsym=bar=0x2000 -Wl,-defsym=baz=print -$QEMU $t/exe | grep -q '^Hello 0x10 0x2000$' +$QEMU $t/exe | grep '^Hello 0x10 0x2000$' diff --git a/test/defsym2.sh b/test/defsym2.sh index b768c2be9e..2d8279fedc 100755 --- a/test/defsym2.sh +++ b/test/defsym2.sh @@ -6,4 +6,4 @@ void foo() {} EOF $CC -B. -o $t/b.so -shared -Wl,-defsym=bar=foo $t/a.o -nm -D $t/b.so | grep -q 'bar' || false +nm -D $t/b.so | grep 'bar' diff --git a/test/demangle-cpp.sh b/test/demangle-cpp.sh index d4db602d1b..4828f60e57 100755 --- a/test/demangle-cpp.sh +++ b/test/demangle-cpp.sh @@ -6,8 +6,7 @@ void _ZN2ns7versionEv(); int main() { _ZN2ns7versionEv(); } EOF -! $CC -B. -o $t/exe1 $t/a.o 2> $t/log || false -grep -Fq 'ns::version()' $t/log +not $CC -B. -o $t/exe1 $t/a.o |& grep -F 'ns::version()' cat <<'EOF' | $CC -c -o $t/b.o -xc - void _ZN2ns7versionEv(); @@ -15,5 +14,4 @@ int main() { _ZN2ns7versionEv(); } __attribute__((section(".comment"))) char str[] = "rustc version x.y.z\n"; EOF -! $CC -B. 
-o $t/exe2 $t/b.o 2> $t/log || false -grep -Fq 'ns::versionv' $t/log +not $CC -B. -o $t/exe2 $t/b.o |& grep -F 'ns::versionv' diff --git a/test/demangle-rust.sh b/test/demangle-rust.sh index 70080bc79a..29dcf4bc76 100755 --- a/test/demangle-rust.sh +++ b/test/demangle-rust.sh @@ -9,6 +9,5 @@ int main() { } EOF -! $CC -B. -o $t/exe $t/a.o 2> $t/log || false - -grep -Fq ' as core::iter::iterator::Iterator>::rposition::::{closure#0}' $t/log +not $CC -B. -o $t/exe $t/a.o |& + grep -F ' as core::iter::iterator::Iterator>::rposition::::{closure#0}' diff --git a/test/demangle.sh b/test/demangle.sh index f7a707bb6c..f52bc5f0bd 100755 --- a/test/demangle.sh +++ b/test/demangle.sh @@ -8,14 +8,14 @@ int main() { } EOF -! $CC -B. -o $t/exe $t/a.o -Wl,-no-demangle 2> $t/log || false -grep -q 'undefined symbol: _Z3fooii$' $t/log +not $CC -B. -o $t/exe $t/a.o -Wl,-no-demangle |& + grep 'undefined symbol: _Z3fooii$' -! $CC -B. -o $t/exe $t/a.o -Wl,-demangle 2> $t/log || false -grep -Eq 'undefined symbol: foo\(int, int\)$' $t/log +not $CC -B. -o $t/exe $t/a.o -Wl,-demangle |& + grep -E 'undefined symbol: foo\(int, int\)$' -! $CC -B. -o $t/exe $t/a.o 2> $t/log || false -grep -Eq 'undefined symbol: foo\(int, int\)$' $t/log +not $CC -B. -o $t/exe $t/a.o |& + grep -E 'undefined symbol: foo\(int, int\)$' cat < $t/log || false -grep -q 'undefined symbol: Pi$' $t/log +not $CC -B. -o $t/exe $t/b.o -Wl,-demangle |& + grep 'undefined symbol: Pi$' diff --git a/test/dependency-file-response-file.sh b/test/dependency-file-response-file.sh index 967d097720..a0d4866e44 100755 --- a/test/dependency-file-response-file.sh +++ b/test/dependency-file-response-file.sh @@ -12,6 +12,6 @@ echo "$t/a.o -Wl,-dependency-file=$t/dep" > $t/rsp $CC -B. -o $t/exe @$t/rsp -grep -q '/exe:.*/a.o ' $t/dep -grep -q '/a.o:$' $t/dep -! 
grep -q '^/tmp' $t/dep || false +grep '/exe:.*/a.o ' $t/dep +grep '/a.o:$' $t/dep +not grep '^/tmp' $t/dep diff --git a/test/dependency-file.sh b/test/dependency-file.sh index 44c3ea8bf5..e2ac46203d 100755 --- a/test/dependency-file.sh +++ b/test/dependency-file.sh @@ -10,5 +10,5 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,-dependency-file=$t/dep -grep -q "dependency-file/exe:.*/a.o " $t/dep -grep -q ".*/a.o:$" $t/dep +grep "dependency-file/exe:.*/a.o " $t/dep +grep ".*/a.o:$" $t/dep diff --git a/test/disable-new-dtags.sh b/test/disable-new-dtags.sh index a414352952..ce8714ae70 100755 --- a/test/disable-new-dtags.sh +++ b/test/disable-new-dtags.sh @@ -6,10 +6,10 @@ void foo() {} EOF $CC -B. -shared -o $t/b.so $t/a.o -Wl,-rpath=/foo -readelf --dynamic $t/b.so | grep -q 'RUNPATH.*/foo' +readelf --dynamic $t/b.so | grep 'RUNPATH.*/foo' $CC -B. -shared -o $t/b.so $t/a.o -Wl,-rpath=/foo -Wl,-enable-new-dtags -readelf --dynamic $t/b.so | grep -q 'RUNPATH.*/foo' +readelf --dynamic $t/b.so | grep 'RUNPATH.*/foo' $CC -B. -shared -o $t/b.so $t/a.o -Wl,-rpath=/foo -Wl,-disable-new-dtags -readelf --dynamic $t/b.so | grep -q 'RPATH.*/foo' +readelf --dynamic $t/b.so | grep 'RPATH.*/foo' diff --git a/test/discard.sh b/test/discard.sh index b7628c7ecc..25faa23ac2 100755 --- a/test/discard.sh +++ b/test/discard.sh @@ -17,24 +17,24 @@ EOF ./mold -o $t/exe $t/a.o readelf --symbols $t/exe > $t/log -grep -Fq _start $t/log -grep -Fq foo $t/log -grep -Fq .Lbar $t/log +grep -F _start $t/log +grep -F foo $t/log +grep -F .Lbar $t/log ./mold -o $t/exe $t/a.o --discard-locals readelf --symbols $t/exe > $t/log -grep -Fq _start $t/log -grep -Fq foo $t/log -! grep -Fq .Lbar $t/log || false +grep -F _start $t/log +grep -F foo $t/log +not grep -F .Lbar $t/log ./mold -o $t/exe $t/a.o --discard-all readelf --symbols $t/exe > $t/log -grep -Fq _start $t/log -! grep -Fq foo $t/log || false -! 
grep -Fq .Lbar $t/log || false +grep -F _start $t/log +not grep -F foo $t/log +not grep -F .Lbar $t/log ./mold -o $t/exe $t/a.o --strip-all readelf --symbols $t/exe > $t/log -! grep -Fq _start $t/log || false -! grep -Fq foo $t/log || false -! grep -Fq .Lbar $t/log || false +not grep -F _start $t/log +not grep -F foo $t/log +not grep -F .Lbar $t/log diff --git a/test/dso-undef.sh b/test/dso-undef.sh index 059daebb21..37dbbad539 100755 --- a/test/dso-undef.sh +++ b/test/dso-undef.sh @@ -24,4 +24,4 @@ int main() { EOF $CC -B. -o $t/exe $t/b.so $t/d.a $t/e.o -readelf --dyn-syms $t/exe | grep -q ' foo$' +readelf --dyn-syms $t/exe | grep ' foo$' diff --git a/test/dt-init.sh b/test/dt-init.sh index f22e45b48a..b297048132 100755 --- a/test/dt-init.sh +++ b/test/dt-init.sh @@ -39,8 +39,8 @@ $CC -B. -o $t/exe2 $t/a.o $t/d.so $QEMU $t/exe1 > $t/log1 $QEMU $t/exe2 > $t/log2 -! grep -q init $t/log1 || false -! grep -q fini $t/log1 || false +not grep init $t/log1 +not grep fini $t/log1 -grep -q init $t/log2 -grep -q fini $t/log2 +grep init $t/log2 +grep fini $t/log2 diff --git a/test/dt-needed.sh b/test/dt-needed.sh index 088a36c3cf..e4c47587f5 100755 --- a/test/dt-needed.sh +++ b/test/dt-needed.sh @@ -14,13 +14,13 @@ int main() { foo(); } EOF $CC -B. -o $t/exe $t/b.o $t/libfoo.so -readelf --dynamic $t/exe | grep -Fq 'Shared library: [libfoo]' +readelf --dynamic $t/exe | grep -F 'Shared library: [libfoo]' $CC -B. -o $t/exe $t/b.o -L $t -lfoo -readelf --dynamic $t/exe | grep -Fq 'Shared library: [libfoo]' +readelf --dynamic $t/exe | grep -F 'Shared library: [libfoo]' $CC -B. -o $t/exe $t/b.o $t/libbar.so -readelf --dynamic $t/exe | grep -Eq 'Shared library: \[.*dt-needed/libbar\.so\]' +readelf --dynamic $t/exe | grep -E 'Shared library: \[.*dt-needed/libbar\.so\]' $CC -B. 
-o $t/exe $t/b.o -L$t -lbar -readelf --dynamic $t/exe | grep -Fq 'Shared library: [libbar.so]' +readelf --dynamic $t/exe | grep -F 'Shared library: [libbar.so]' diff --git a/test/duplicate-error-archive.sh b/test/duplicate-error-archive.sh index fa4db125f3..6a9a098950 100755 --- a/test/duplicate-error-archive.sh +++ b/test/duplicate-error-archive.sh @@ -15,7 +15,7 @@ EOF $CC -B. -o $t/exe $t/c.o $t/b.a $t/b.a -! $CC -B. -o $t/exe $t/c.o -Wl,--push-state,--whole-archive \ - $t/b.a $t/b.a -Wl,--pop-state 2> $t/log || false +not $CC -B. -o $t/exe $t/c.o -Wl,--push-state,--whole-archive \ + $t/b.a $t/b.a -Wl,--pop-state 2> $t/log -grep -q 'duplicate symbol:.* foo' $t/log +grep 'duplicate symbol:.* foo' $t/log diff --git a/test/duplicate-error-gc-sections.sh b/test/duplicate-error-gc-sections.sh new file mode 100755 index 0000000000..6be141f3ce --- /dev/null +++ b/test/duplicate-error-gc-sections.sh @@ -0,0 +1,18 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +nm mold | grep '__tsan_init' && skip + +cat < $t/log || false -grep -q 'duplicate symbol: .*\.o: .*\.o: main' $t/log +not ./mold -o $t/exe $t/a.o $t/a.o |& + grep 'duplicate symbol: .*\.o: .*\.o: main' diff --git a/test/dynamic-dt-debug.sh b/test/dynamic-dt-debug.sh index 7422e364a6..560e6c0326 100755 --- a/test/dynamic-dt-debug.sh +++ b/test/dynamic-dt-debug.sh @@ -7,12 +7,11 @@ EOF $CC -B. -o $t/exe $t/a.o readelf --dynamic $t/exe > $t/log -grep -Fq '(DEBUG)' $t/log +grep -F '(DEBUG)' $t/log cat < $t/log -! grep -Fq '(DEBUG)' $t/log || false +readelf --dynamic $t/c.so | not grep -F '(DEBUG)' diff --git a/test/dynamic-linker.sh b/test/dynamic-linker.sh index f861fb560f..ae469cf4d1 100755 --- a/test/dynamic-linker.sh +++ b/test/dynamic-linker.sh @@ -7,13 +7,7 @@ _start: EOF ./mold -o $t/exe $t/a.o - -readelf --sections $t/exe > $t/log -! 
grep -Fq .interp $t/log || false - -readelf --dynamic $t/exe > $t/log +readelf --sections $t/exe | not grep -F .interp ./mold -o $t/exe $t/a.o --dynamic-linker=/foo/bar - -readelf --sections $t/exe > $t/log -grep -Fq .interp $t/log +readelf --sections $t/exe | grep -F .interp diff --git a/test/dynamic-list-data.sh b/test/dynamic-list-data.sh index b76c613aea..befa18624a 100755 --- a/test/dynamic-list-data.sh +++ b/test/dynamic-list-data.sh @@ -9,5 +9,5 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,-dynamic-list-data readelf -W --dyn-syms $t/exe > $t/log -grep -wq foo $t/log -! grep -wq bar $t/log || false +grep -w foo $t/log +not grep -w bar $t/log diff --git a/test/dynamic-list.sh b/test/dynamic-list.sh index 358896c09c..3357e1ff0f 100755 --- a/test/dynamic-list.sh +++ b/test/dynamic-list.sh @@ -10,8 +10,8 @@ EOF $CC -B. -o $t/exe $t/a.o readelf --dyn-syms $t/exe > $t/log -! grep -q ' foo' $t/log || false -! grep -q ' bar' $t/log || false +not grep ' foo' $t/log +not grep ' bar' $t/log cat < $t/dyn { foo; bar; }; @@ -20,17 +20,17 @@ EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-dynamic-list=$t/dyn readelf --dyn-syms $t/exe1 > $t/log1 -grep -q ' foo' $t/log1 -grep -q ' bar' $t/log1 +grep ' foo' $t/log1 +grep ' bar' $t/log1 $CC -B. -o $t/exe2 $t/a.o -Wl,--export-dynamic-symbol-list=$t/dyn readelf --dyn-syms $t/exe2 > $t/log2 -grep -q ' foo' $t/log2 -grep -q ' bar' $t/log2 +grep ' foo' $t/log2 +grep ' bar' $t/log2 $CC -B. -o $t/exe3 $t/a.o -Wl,--export-dynamic-symbol=foo,--export-dynamic-symbol=bar readelf --dyn-syms $t/exe3 > $t/log3 -grep -q ' foo' $t/log3 -grep -q ' bar' $t/log3 +grep ' foo' $t/log3 +grep ' bar' $t/log3 diff --git a/test/dynamic-list2.sh b/test/dynamic-list2.sh index ab80d04313..e644a0ca58 100755 --- a/test/dynamic-list2.sh +++ b/test/dynamic-list2.sh @@ -14,8 +14,8 @@ EOF $CXX -B. -o $t/exe $t/a.o $t/b.o readelf --dyn-syms $t/exe > $t/log -! grep -q ' foo' $t/log || false -! 
grep -q ' bar' $t/log || false +not grep ' foo' $t/log +not grep ' bar' $t/log cat < $t/dyn { foo; extern "C++" { "baz(int)"; }; }; @@ -24,13 +24,13 @@ EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,-dynamic-list=$t/dyn readelf --dyn-syms $t/exe1 > $t/log1 -grep -q ' foo' $t/log1 -! grep -q ' bar' $t/log1 || false -grep -q ' _Z3bazi' $t/log1 +grep ' foo' $t/log1 +not grep ' bar' $t/log1 +grep ' _Z3bazi' $t/log1 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--export-dynamic-symbol-list=$t/dyn readelf --dyn-syms $t/exe2 > $t/log2 -grep -q ' foo' $t/log2 -! grep -q ' bar' $t/log2 || false -grep -q ' _Z3bazi' $t/log2 +grep ' foo' $t/log2 +not grep ' bar' $t/log2 +grep ' _Z3bazi' $t/log2 diff --git a/test/dynamic-list3.sh b/test/dynamic-list3.sh index b2f080c815..1b89e307ac 100755 --- a/test/dynamic-list3.sh +++ b/test/dynamic-list3.sh @@ -21,19 +21,19 @@ EOF $CC -B. -Wl,--dynamic-list=$t/dyn -o $t/exe1 $t/b.o readelf --dyn-syms $t/exe1 > $t/log1 -grep -q ' xyz' $t/log1 -! grep -q ' foobarzx' $t/log1 || false -grep -q ' foobarcx' $t/log1 -grep -q ' foo123bar456bx' $t/log1 -! grep -q ' foo123bar456c' $t/log1 || false -! grep -q ' foo123bar456x' $t/log1 || false +grep ' xyz' $t/log1 +not grep ' foobarzx' $t/log1 +grep ' foobarcx' $t/log1 +grep ' foo123bar456bx' $t/log1 +not grep ' foo123bar456c' $t/log1 +not grep ' foo123bar456x' $t/log1 $CC -B. -Wl,--export-dynamic-symbol-list=$t/dyn -o $t/exe2 $t/b.o readelf --dyn-syms $t/exe2 > $t/log2 -grep -q ' xyz' $t/log2 -! grep -q ' foobarzx' $t/log2 || false -grep -q ' foobarcx' $t/log2 -grep -q ' foo123bar456bx' $t/log2 -! grep -q ' foo123bar456c' $t/log2 || false -! 
grep -q ' foo123bar456x' $t/log2 || false +grep ' xyz' $t/log2 +not grep ' foobarzx' $t/log2 +grep ' foobarcx' $t/log2 +grep ' foo123bar456bx' $t/log2 +not grep ' foo123bar456c' $t/log2 +not grep ' foo123bar456x' $t/log2 diff --git a/test/dynamic-list4.sh b/test/dynamic-list4.sh index 83d88887e5..8c8e42f33a 100755 --- a/test/dynamic-list4.sh +++ b/test/dynamic-list4.sh @@ -38,7 +38,7 @@ int main() { print(); } EOF $CC -B. -o $t/exe1 $t/e.o -Wl,-push-state,-no-as-needed $t/b.so -Wl,-pop-state -$QEMU $t/exe1 | grep -q 'foo1 bar1 baz1' +$QEMU $t/exe1 | grep 'foo1 bar1 baz1' $CC -B. -o $t/exe2 $t/e.o -Wl,-push-state,-no-as-needed $t/d.so $t/b.so -Wl,-pop-state -$QEMU $t/exe2 | grep -q 'foo2 bar2 baz1' +$QEMU $t/exe2 | grep 'foo2 bar2 baz1' diff --git a/test/dynamic.sh b/test/dynamic.sh index 2b9576c2ae..0658ae80e9 100755 --- a/test/dynamic.sh +++ b/test/dynamic.sh @@ -5,16 +5,7 @@ echo '.globl main; main:' | $CC -o $t/a.o -c -x assembler - $CC -B. -o $t/exe $t/a.o -readelf --dynamic $t/exe > $t/log -grep -Eq 'Shared library:.*\blibc\b' $t/log +readelf --dynamic $t/exe | grep -E 'Shared library:.*\blibc\b' -readelf -W --dyn-syms --use-dynamic $t/exe > $t/log2 -grep -Eq 'FUNC\s+GLOBAL\s+DEFAULT.*UND\s+__libc_start' $t/log2 - -cat < - -int main() { - printf("Hello world\n"); -} -EOF +readelf -W --dyn-syms --use-dynamic $t/exe | + grep -E 'FUNC\s+GLOBAL\s+DEFAULT.*UND\s+__libc_start' diff --git a/test/emit-relocs-cpp.sh b/test/emit-relocs-cpp.sh index bfe45926d3..63aa5b2beb 100755 --- a/test/emit-relocs-cpp.sh +++ b/test/emit-relocs-cpp.sh @@ -9,7 +9,7 @@ int main() { printf("Hello world\n"); } EOF $CXX -B. 
-o $t/exe $t/a.o -Wl,-emit-relocs -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' -readelf -SW $t/exe | grep -Eq 'rela?\.text' -readelf -SW $t/exe | grep -Eq 'rela?\.eh_frame' +readelf -SW $t/exe | grep -E 'rela?\.text' +readelf -SW $t/exe | grep -E 'rela?\.eh_frame' diff --git a/test/emit-relocs-dead-sections.sh b/test/emit-relocs-dead-sections.sh index c548e5a643..babf6586d0 100755 --- a/test/emit-relocs-dead-sections.sh +++ b/test/emit-relocs-dead-sections.sh @@ -24,4 +24,4 @@ int main() {} EOF $CXX -B. -o $t/exe $t/a.o $t/b.o -Wl,-emit-relocs -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/emit-relocs.sh b/test/emit-relocs.sh index db51e481c7..69cf0a9ef0 100755 --- a/test/emit-relocs.sh +++ b/test/emit-relocs.sh @@ -9,6 +9,6 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -Wl,-emit-relocs -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' -readelf -S $t/exe | grep -Eq 'rela?\.text' +readelf -S $t/exe | grep -E 'rela?\.text' diff --git a/test/empty-arg.sh b/test/empty-arg.sh index 60182b07f5..9918abd569 100755 --- a/test/empty-arg.sh +++ b/test/empty-arg.sh @@ -1,5 +1,4 @@ #!/bin/bash . $(dirname $0)/common.inc -! ./mold -m elf_x86_64 '' >& $t/log -grep -q 'cannot open :' $t/log +not ./mold -m elf_x86_64 '' |& grep 'cannot open :' diff --git a/test/empty-file.sh b/test/empty-file.sh index 21cbaaeabb..41783ddc49 100755 --- a/test/empty-file.sh +++ b/test/empty-file.sh @@ -12,4 +12,4 @@ rm -f $t/b.script touch $t/b.script $CC -B. -o $t/exe $t/a.o -Wl,--version-script,$t/b.script -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/empty-input.sh b/test/empty-input.sh index 972a32d51d..df66b232da 100755 --- a/test/empty-input.sh +++ b/test/empty-input.sh @@ -3,5 +3,5 @@ rm -f $t/a.o touch $t/a.o -! $CC -B. -o $t/exe $t/a.o &> $t/log || false -grep -q 'unknown file type' $t/log +not $CC -B. 
-o $t/exe $t/a.o &> $t/log +grep 'unknown file type' $t/log diff --git a/test/empty-version.sh b/test/empty-version.sh index 01ec07305b..a800955d32 100755 --- a/test/empty-version.sh +++ b/test/empty-version.sh @@ -11,5 +11,5 @@ EOF $CC -B. -shared -o $t/b.so $t/a.o -readelf --dyn-syms $t/b.so | grep -q ' bar1' -readelf --dyn-syms $t/b.so | grep -q ' bar2' +readelf --dyn-syms $t/b.so | grep ' bar1' +readelf --dyn-syms $t/b.so | grep ' bar2' diff --git a/test/entry.sh b/test/entry.sh index 68e4d06125..33ba966536 100755 --- a/test/entry.sh +++ b/test/entry.sh @@ -17,8 +17,8 @@ EOF $CC -B. -o $t/exe1 -Wl,-e,foo $t/a.o $t/b.o readelf -e $t/exe1 > $t/log -grep -q "Entry point address:.*0x1000$" $t/log +grep "Entry point address:.*0x1000$" $t/log $CC -B. -o $t/exe2 -Wl,-e,bar $t/a.o $t/b.o readelf -e $t/exe2 > $t/log -grep -q "Entry point address:.*0x2000$" $t/log +grep "Entry point address:.*0x2000$" $t/log diff --git a/test/exception-multiple-ehframe.sh b/test/exception-multiple-ehframe.sh index c411eb9265..5d279ce1b4 100755 --- a/test/exception-multiple-ehframe.sh +++ b/test/exception-multiple-ehframe.sh @@ -1,8 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc -nm mold | grep -q '__tsan_init' && skip - +[ $MACHINE = sh4aeb ] && skip +nm mold | grep '__tsan_init' && skip command -v perl > /dev/null || skip [ $MACHINE = sh4 ] && skip @@ -45,4 +45,4 @@ int main() { EOF $CXX -B. -o $t/exe1 $t/d.o $t/c.o -$QEMU $t/exe1 | grep -q '^1 3$' +$QEMU $t/exe1 | grep '^1 3$' diff --git a/test/exclude-libs.sh b/test/exclude-libs.sh index 3924399123..afd94ad1f3 100755 --- a/test/exclude-libs.sh +++ b/test/exclude-libs.sh @@ -32,30 +32,30 @@ EOF $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a readelf --dyn-syms $t/f.so > $t/log -grep -Fq foo $t/log -grep -Fq bar $t/log -grep -Fq baz $t/log +grep -F foo $t/log +grep -F bar $t/log +grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a readelf --dyn-syms $t/f.so > $t/log -! 
grep -Fq foo $t/log || false -grep -Fq bar $t/log -grep -Fq baz $t/log +not grep -F foo $t/log +grep -F bar $t/log +grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a -Wl,-exclude-libs=d.a readelf --dyn-syms $t/f.so > $t/log -! grep -Fq foo $t/log || false -! grep -Fq bar $t/log || false -grep -Fq baz $t/log +not grep -F foo $t/log +not grep -F bar $t/log +grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a:d.a readelf --dyn-syms $t/f.so > $t/log -! grep -Fq foo $t/log || false -! grep -Fq bar $t/log || false -grep -Fq baz $t/log +not grep -F foo $t/log +not grep -F bar $t/log +grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=ALL readelf --dyn-syms $t/f.so > $t/log -! grep -Fq foo $t/log || false -! grep -Fq bar $t/log || false -grep -Fq baz $t/log +not grep -F foo $t/log +not grep -F bar $t/log +grep -F baz $t/log diff --git a/test/exclude-libs2.sh b/test/exclude-libs2.sh index 8b29de45f3..c60b195671 100755 --- a/test/exclude-libs2.sh +++ b/test/exclude-libs2.sh @@ -16,5 +16,4 @@ int foo() { EOF $CC -B. -shared -o $t/d.so $t/c.o $t/b.a -Wl,-exclude-libs=b.a -readelf --dyn-syms $t/d.so > $t/log -grep -Fq foo $t/log +readelf --dyn-syms $t/d.so | grep -F foo diff --git a/test/exclude-libs3.sh b/test/exclude-libs3.sh index caaf4584c4..ec43621aa8 100755 --- a/test/exclude-libs3.sh +++ b/test/exclude-libs3.sh @@ -15,5 +15,4 @@ void foo() { bar(); } EOF $CC -B. -shared -o $t/d.so $t/c.o $t/b.a -Wl,-exclude-libs=ALL -readelf --dyn-syms $t/d.so > $t/log -grep -Fq foo $t/log +readelf --dyn-syms $t/d.so | grep -F foo diff --git a/test/execstack.sh b/test/execstack.sh index c809b008fa..954f939bf0 100755 --- a/test/execstack.sh +++ b/test/execstack.sh @@ -6,10 +6,10 @@ int main() {} EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,execstack -readelf --segments -W $t/exe | grep -q 'GNU_STACK.* RWE ' +readelf --segments -W $t/exe | grep 'GNU_STACK.* RWE ' $CC -B. 
-o $t/exe $t/a.o -Wl,-z,execstack -Wl,-z,noexecstack -readelf --segments -W $t/exe | grep -q 'GNU_STACK.* RW ' +readelf --segments -W $t/exe | grep 'GNU_STACK.* RW ' $CC -B. -o $t/exe $t/a.o -readelf --segments -W $t/exe | grep -q 'GNU_STACK.* RW ' +readelf --segments -W $t/exe | grep 'GNU_STACK.* RW ' diff --git a/test/execute-only.sh b/test/execute-only.sh index ecfe2700a2..be0de75925 100755 --- a/test/execute-only.sh +++ b/test/execute-only.sh @@ -16,5 +16,5 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -Wl,--execute-only -$QEMU $t/exe | grep -q 'Hello world' -readelf -W --segments $t/exe | grep -Eq 'LOAD\s.*[0-9a-f] E 0x' +$QEMU $t/exe | grep 'Hello world' +readelf -W --segments $t/exe | grep -E 'LOAD\s.*[0-9a-f] E 0x' diff --git a/test/export-dynamic.sh b/test/export-dynamic.sh index 5dc6ca565c..ab1632097d 100755 --- a/test/export-dynamic.sh +++ b/test/export-dynamic.sh @@ -19,5 +19,5 @@ $CC -shared -fPIC -o $t/b.so -xc /dev/null ./mold -o $t/exe $t/a.o $t/b.so --export-dynamic readelf --dyn-syms $t/exe > $t/log -grep -Eq 'NOTYPE\s+GLOBAL DEFAULT\s+[0-9]+ bar' $t/log -grep -Eq 'NOTYPE\s+GLOBAL DEFAULT\s+[0-9]+ _start' $t/log +grep -E 'NOTYPE\s+GLOBAL DEFAULT\s+[0-9]+ bar' $t/log +grep -E 'NOTYPE\s+GLOBAL DEFAULT\s+[0-9]+ _start' $t/log diff --git a/test/export-from-exe.sh b/test/export-from-exe.sh index f1d0ce2375..3589e022e3 100755 --- a/test/export-from-exe.sh +++ b/test/export-from-exe.sh @@ -23,5 +23,5 @@ void foo() { EOF $CC -B. -o $t/exe $t/a.o $t/b.so -readelf --dyn-syms $t/exe | grep -q expfn2 -readelf --dyn-syms $t/exe | grep -q expfn1 +readelf --dyn-syms $t/exe | grep expfn2 +readelf --dyn-syms $t/exe | grep expfn1 diff --git a/test/fatal-warnings.sh b/test/fatal-warnings.sh index 59146c3a39..9cd58cd351 100755 --- a/test/fatal-warnings.sh +++ b/test/fatal-warnings.sh @@ -13,8 +13,6 @@ int main() { } EOF -$CC -B. -o $t/exe $t/a.o $t/b.o \ - -Wl,-warn-common 2> /dev/null +$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-warn-common 2> /dev/null -! $CC -B. 
-o $t/exe $t/a.o $t/b.o \ - -Wl,-warn-common -Wl,-fatal-warnings 2> /dev/null || false +not $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-warn-common -Wl,-fatal-warnings 2> /dev/null diff --git a/test/filter.sh b/test/filter.sh index bc2a7afec9..97119840ef 100755 --- a/test/filter.sh +++ b/test/filter.sh @@ -11,5 +11,5 @@ EOF ./mold -o $t/b.so $t/a.o --filter foo -F bar -shared readelf --dynamic $t/b.so > $t/log -grep -Fq 'Filter library: [foo]' $t/log -grep -Fq 'Filter library: [bar]' $t/log +grep -F 'Filter library: [foo]' $t/log +grep -F 'Filter library: [bar]' $t/log diff --git a/test/func-addr.sh b/test/func-addr.sh index 7c39c5a212..6756572c2b 100755 --- a/test/func-addr.sh +++ b/test/func-addr.sh @@ -19,4 +19,4 @@ int main() { EOF $CC -B. -o $t/exe -no-pie $t/b.o $t/a.so -$QEMU $t/exe | grep -q 1 +$QEMU $t/exe | grep 1 diff --git a/test/gc-sections.sh b/test/gc-sections.sh index 4848f11b94..bd8f605bb7 100755 --- a/test/gc-sections.sh +++ b/test/gc-sections.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -cat < $t/a.cc +cat < int two() { return 2; } @@ -22,28 +22,28 @@ int main() { } EOF -$CXX -B. -o $t/exe1 $t/a.cc -ffunction-sections -fdata-sections - -readelf --symbols $t/exe1 > $t/log.1 -grep -qv live_fn1 $t/log.1 -grep -qv live_fn2 $t/log.1 -grep -qv dead_fn1 $t/log.1 -grep -qv dead_fn2 $t/log.1 -grep -qv live_var1 $t/log.1 -grep -qv live_var2 $t/log.1 -grep -qv dead_var1 $t/log.1 -grep -qv dead_var2 $t/log.1 -$QEMU $t/exe1 | grep -q '1 2' - -$CXX -B. -o $t/exe2 $t/a.cc -ffunction-sections -fdata-sections -Wl,-gc-sections - -readelf --symbols $t/exe2 > $t/log.2 -grep -q live_fn1 $t/log.2 -grep -q live_fn2 $t/log.2 -grep -qv dead_fn1 $t/log.2 -grep -qv dead_fn2 $t/log.2 -grep -q live_var1 $t/log.2 -grep -q live_var2 $t/log.2 -grep -qv dead_var1 $t/log.2 -grep -qv dead_var2 $t/log.2 -$QEMU $t/exe2 | grep -q '1 2' +$CXX -B. 
-o $t/exe1 $t/a.o +readelf --symbols $t/exe1 > $t/log1 +$QEMU $t/exe1 | grep '1 2' + +grep live_fn1 $t/log1 +grep live_fn2 $t/log1 +grep dead_fn1 $t/log1 +grep dead_fn2 $t/log1 +grep live_var1 $t/log1 +grep live_var2 $t/log1 +grep dead_var1 $t/log1 +grep dead_var2 $t/log1 + +$CXX -B. -o $t/exe2 $t/a.o -Wl,-gc-sections +readelf --symbols $t/exe2 > $t/log2 +$QEMU $t/exe2 | grep '1 2' + +grep live_fn1 $t/log2 +grep live_fn2 $t/log2 +not grep dead_fn1 $t/log2 +not grep dead_fn2 $t/log2 +grep live_var1 $t/log2 +grep live_var2 $t/log2 +not grep dead_var1 $t/log2 +not grep dead_var2 $t/log2 diff --git a/test/gdb-index-compress-output.sh b/test/gdb-index-compress-output.sh index 0b180ac921..4a8645510b 100755 --- a/test/gdb-index-compress-output.sh +++ b/test/gdb-index-compress-output.sh @@ -3,7 +3,6 @@ on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip - command -v gdb >& /dev/null || skip cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/b.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/exe 2> /dev/null | grep -F .gdb_index -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log -grep -q 'hello () at .*:7' $t/log -grep -q 'greet () at .*:11' $t/log -grep -q 'main () at .*:4' $t/log +grep 'hello () at .*:7' $t/log +grep 'greet () at .*:11' $t/log +grep 'main () at .*:4' $t/log diff --git a/test/gdb-index-dwarf2.sh b/test/gdb-index-dwarf2.sh index 79497935d6..070476711d 100755 --- a/test/gdb-index-dwarf2.sh +++ b/test/gdb-index-dwarf2.sh @@ -3,10 +3,8 @@ on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip - command -v gdb >& /dev/null || skip - -echo 'int main() {}' | $CC -o /dev/null -xc -gdwarf-2 -g - >& /dev/null || skip +test_cflags -gdwarf-2 -g || skip cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/c.so 2> /dev/null | 
grep -F .gdb_index cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/exe 2> /dev/null | grep -F .gdb_index -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log -grep -q 'hello2 () at .*:7' $t/log -grep -q 'hello () at .*:4' $t/log -grep -q 'greet () at .*:8' $t/log -grep -q 'main () at .*:4' $t/log +grep 'hello2 () at .*:7' $t/log +grep 'hello () at .*:4' $t/log +grep 'greet () at .*:8' $t/log +grep 'main () at .*:4' $t/log diff --git a/test/gdb-index-dwarf3.sh b/test/gdb-index-dwarf3.sh index a093eade97..e01c234266 100755 --- a/test/gdb-index-dwarf3.sh +++ b/test/gdb-index-dwarf3.sh @@ -3,9 +3,7 @@ on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip - command -v gdb >& /dev/null || skip - test_cflags -gdwarf-3 || skip cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/c.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/exe 2> /dev/null | grep -F .gdb_index -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log -grep -q 'hello2 () at .*:7' $t/log -grep -q 'hello () at .*:4' $t/log -grep -q 'greet () at .*:8' $t/log -grep -q 'main () at .*:4' $t/log +grep 'hello2 () at .*:7' $t/log +grep 'hello () at .*:4' $t/log +grep 'greet () at .*:8' $t/log +grep 'main () at .*:4' $t/log diff --git a/test/gdb-index-dwarf4.sh b/test/gdb-index-dwarf4.sh index e028fa9907..8b03309322 100755 --- a/test/gdb-index-dwarf4.sh +++ b/test/gdb-index-dwarf4.sh @@ -3,9 +3,7 @@ on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip - command -v gdb >& /dev/null || skip - test_cflags -gdwarf-4 -g || skip cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/c.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -Fq 
.gdb_index +readelf -WS $t/exe 2> /dev/null | grep -F .gdb_index -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log -grep -q 'hello2 () at .*:7' $t/log -grep -q 'hello () at .*:4' $t/log -grep -q 'greet () at .*:8' $t/log -grep -q 'main () at .*:4' $t/log +grep 'hello2 () at .*:7' $t/log +grep 'hello () at .*:4' $t/log +grep 'greet () at .*:8' $t/log +grep 'main () at .*:4' $t/log diff --git a/test/gdb-index-dwarf5.sh b/test/gdb-index-dwarf5.sh index 1f3ebc8409..21057b443a 100755 --- a/test/gdb-index-dwarf5.sh +++ b/test/gdb-index-dwarf5.sh @@ -3,9 +3,7 @@ on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip - command -v gdb >& /dev/null || skip - test_cflags -gdwarf-5 -g || skip cat < $t/a.c @@ -64,9 +62,9 @@ $CC -c -o $t/c.o $t/c.c -fPIC -g -ggnu-pubnames -gdwarf-5 $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -ffunction-sections $CC -B. 
-shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index -readelf -WS $t/e.so 2> /dev/null | grep -Fq .gdb_index -readelf --debug=gdb_index $t/e.so 2> /dev/null | grep -q 'fn1: .* \[global, function\]' -readelf --debug=gdb_index $t/e.so 2> /dev/null | grep -q 'char: .* \[static, type\]' +readelf -WS $t/e.so 2> /dev/null | grep -F .gdb_index +readelf --debug=gdb_index $t/e.so 2> /dev/null | grep 'fn1: .* \[global, function\]' +readelf --debug=gdb_index $t/e.so 2> /dev/null | grep 'char: .* \[static, type\]' cat < /dev/null | grep -Fq .gdb_index -readelf --debug=gdb_index $t/exe 2> /dev/null | grep -q 'main: .* \[global, function\]' +readelf -WS $t/exe 2> /dev/null | grep -F .gdb_index +readelf --debug=gdb_index $t/exe 2> /dev/null | grep 'main: .* \[global, function\]' -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log -grep -q 'fn8 () at .*/d.c:6' $t/log -grep -q 'fn7 () at .*/d.c:10' $t/log -grep -q 'fn6 () at .*/c.c:4' $t/log -grep -q 'fn5 () at .*/c.c:8' $t/log -grep -q 'fn4 () at .*/b.c:4' $t/log -grep -q 'fn3 () at .*/b.c:8' $t/log -grep -q 'fn2 () at .*/a.c:4' $t/log -grep -q 'fn1 () at .*/a.c:8' $t/log +grep 'fn8 () at .*/d.c:6' $t/log +grep 'fn7 () at .*/d.c:10' $t/log +grep 'fn6 () at .*/c.c:4' $t/log +grep 'fn5 () at .*/c.c:8' $t/log +grep 'fn4 () at .*/b.c:4' $t/log +grep 'fn3 () at .*/b.c:8' $t/log +grep 'fn2 () at .*/a.c:4' $t/log +grep 'fn1 () at .*/a.c:8' $t/log diff --git a/test/gdb-index-dwarf64.sh b/test/gdb-index-dwarf64.sh index 819956ef05..1fe3580249 100755 --- a/test/gdb-index-dwarf64.sh +++ b/test/gdb-index-dwarf64.sh @@ -3,7 +3,6 @@ on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip - command -v gdb >& /dev/null || skip test_cflags -gdwarf-5 -g -gdwarf64 || skip @@ -66,7 +65,7 @@ $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -gdwarf64 -ffunction-s $CC -B. 
-shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o $CC -B. -shared -o $t/f.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index -readelf -WS $t/f.so 2> /dev/null | grep -Fq .gdb_index +readelf -WS $t/f.so 2> /dev/null | grep -F .gdb_index cat <& $t/log1 -grep -q 'DW_FORM_line_strp pointing outside of .debug_line_str' $t/log1 && skip + +DEBUGINFOD_URLS= gdb $t/exe1 -nx -batch -ex 'b main' -ex r -ex quit |& + grep 'DW_FORM_line_strp pointing outside of .debug_line_str' && skip # We are using a recent version of gdb. $CC -B. -o $t/exe2 $t/f.so $t/g.o -Wl,--gdb-index -readelf -WS $t/exe2 2> /dev/null | grep -Fq .gdb_index +readelf -WS $t/exe2 2> /dev/null | grep -F .gdb_index -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe2 -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log2 -grep -q 'fn8 () at .*/d.c:6' $t/log2 -grep -q 'fn7 () at .*/d.c:10' $t/log2 -grep -q 'fn6 () at .*/c.c:4' $t/log2 -grep -q 'fn5 () at .*/c.c:8' $t/log2 -grep -q 'fn4 () at .*/b.c:4' $t/log2 -grep -q 'fn3 () at .*/b.c:8' $t/log2 -grep -q 'fn2 () at .*/a.c:4' $t/log2 -grep -q 'fn1 () at .*/a.c:8' $t/log2 +grep 'fn8 () at .*/d.c:6' $t/log2 +grep 'fn7 () at .*/d.c:10' $t/log2 +grep 'fn6 () at .*/c.c:4' $t/log2 +grep 'fn5 () at .*/c.c:8' $t/log2 +grep 'fn4 () at .*/b.c:4' $t/log2 +grep 'fn3 () at .*/b.c:8' $t/log2 +grep 'fn2 () at .*/a.c:4' $t/log2 +grep 'fn1 () at .*/a.c:8' $t/log2 diff --git a/test/gdb-index-empty.sh b/test/gdb-index-empty.sh index f0a6be8bf4..72a571810a 100755 --- a/test/gdb-index-empty.sh +++ b/test/gdb-index-empty.sh @@ -3,5 +3,4 @@ echo 'void _start() {}' | $CC -c -o $t/a.o -xc - ./mold -o $t/exe $t/a.o -gdb-index -readelf -WS $t/exe > $t/log -! 
grep -Fq .gdb_index $t/log || false +readelf -WS $t/exe | not grep -F .gdb_index diff --git a/test/gdb-index-split-dwarf.sh b/test/gdb-index-split-dwarf.sh index cbb1d30fc7..d7ce7c752c 100755 --- a/test/gdb-index-split-dwarf.sh +++ b/test/gdb-index-split-dwarf.sh @@ -3,9 +3,7 @@ on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip - command -v gdb >& /dev/null || skip - test_cflags -gdwarf-5 -g || skip cat < $t/a.c @@ -64,7 +62,7 @@ $CC -c -o $t/c.o $t/c.c -fPIC -g -ggnu-pubnames -gdwarf-5 $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -gsplit-dwarf $CC -B. -shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index -readelf -WS $t/e.so 2> /dev/null | grep -Fq .gdb_index +readelf -WS $t/e.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -Fq .gdb_index +readelf -WS $t/exe 2> /dev/null | grep -F .gdb_index -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/glibc-2.22-bug.sh b/test/glibc-2.22-bug.sh index 27820accd6..597f2c5104 100755 --- a/test/glibc-2.22-bug.sh +++ b/test/glibc-2.22-bug.sh @@ -14,5 +14,5 @@ int main() { EOF $CC -B. -o $t/b.so -shared $t/a.o -readelf -W --sections $t/b.so | grep -E -A1 '\.rela?\.dyn' | \ - grep -Eq '\.rela?\.plt' +readelf -W --sections $t/b.so | grep -E -A1 '\.rela?\.dyn' | + grep -E '\.rela?\.plt' diff --git a/test/global-offset-table.sh b/test/global-offset-table.sh index 45008ef152..aa4dc09b57 100755 --- a/test/global-offset-table.sh +++ b/test/global-offset-table.sh @@ -21,7 +21,7 @@ GOT_ADDR=$($QEMU $t/exe) # _GLOBAL_OFFSET_TABLE_ refers the end of .got only on x86. # We assume .got is followed by .gotplt. 
if [ $MACHINE = x86_64 -o $MACHINE = i686 ]; then - readelf -WS $t/exe | grep -q "\.got\.plt .*$GOT_ADDR " + readelf -WS $t/exe | grep "\.got\.plt .*$GOT_ADDR " else - readelf -WS $t/exe | grep -q "\.got .*$GOT_ADDR " + readelf -WS $t/exe | grep "\.got .*$GOT_ADDR " fi diff --git a/test/gnu-property.sh b/test/gnu-property.sh index aff85c0111..5735d9d1d2 100755 --- a/test/gnu-property.sh +++ b/test/gnu-property.sh @@ -6,5 +6,5 @@ int main() {} EOF $CC -B. -o $t/exe $t/a.o -no-pie -readelf -W --sections $t/exe | grep -Fqw .note.gnu.property || skip -readelf -W --segments $t/exe | grep -qw GNU_PROPERTY +readelf -W --sections $t/exe | grep -Fw .note.gnu.property || skip +readelf -W --segments $t/exe | grep -w GNU_PROPERTY diff --git a/test/gnu-retain.sh b/test/gnu-retain.sh index 80bb7690b0..d44c18458b 100755 --- a/test/gnu-retain.sh +++ b/test/gnu-retain.sh @@ -10,9 +10,9 @@ int main() {} EOF # Older versions of GCC does not support __attribute__((retain)) -readelf -WS $t/a.o | grep -q '\.text\.foo.*AXR' || skip +readelf -WS $t/a.o | grep '\.text\.foo.*AXR' || skip $CC -B. -o $t/exe $t/a.o -Wl,-gc-sections nm $t/exe > $t/log -grep -q foo $t/log -! grep -q bar $t/log || false +grep foo $t/log +not grep bar $t/log diff --git a/test/gnu-unique.sh b/test/gnu-unique.sh index ae425e45f0..b15ca52b3d 100755 --- a/test/gnu-unique.sh +++ b/test/gnu-unique.sh @@ -19,4 +19,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o -no-pie -$QEMU $t/exe | grep -q 'foo=5' +$QEMU $t/exe | grep 'foo=5' diff --git a/test/gnu-warning.sh b/test/gnu-warning.sh index 570fb130f7..1b0bf8ad8e 100755 --- a/test/gnu-warning.sh +++ b/test/gnu-warning.sh @@ -17,4 +17,4 @@ EOF # Make sure that we do not copy .gnu.warning.* sections. $CC -B. 
-o $t/exe $t/a.o -no-pie -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/hash-style-sysv.sh b/test/hash-style-sysv.sh new file mode 100755 index 0000000000..ffb14c0330 --- /dev/null +++ b/test/hash-style-sysv.sh @@ -0,0 +1,21 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < +void hello() { + printf("Hello world\n"); +} +EOF + +$CC -B. -shared -o $t/b.so $t/a.o -Wl,--hash-style=sysv + +cat < $t/log -! grep -Fq ' .hash' $t/log || false -! grep -Fq ' .gnu.hash' $t/log || false +not grep -F ' .hash' $t/log +not grep -F ' .gnu.hash' $t/log diff --git a/test/hello-dynamic.sh b/test/hello-dynamic.sh index 6baa458bd6..4e4338b39d 100755 --- a/test/hello-dynamic.sh +++ b/test/hello-dynamic.sh @@ -9,4 +9,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -no-pie -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/hello-static.sh b/test/hello-static.sh index 8d96806f7c..d497d8f990 100755 --- a/test/hello-static.sh +++ b/test/hello-static.sh @@ -11,4 +11,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -static -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/help.sh b/test/help.sh index 0a44bffe6e..50bffafe14 100755 --- a/test/help.sh +++ b/test/help.sh @@ -1,4 +1,4 @@ #!/bin/bash . $(dirname $0)/common.inc -./mold --help | grep -q Usage +./mold --help | grep Usage diff --git a/test/hidden-undef.sh b/test/hidden-undef.sh index 52226573fd..39b2f9861e 100755 --- a/test/hidden-undef.sh +++ b/test/hidden-undef.sh @@ -10,5 +10,4 @@ __attribute__((visibility("hidden"))) void foo(); int main() { foo(); } EOF -! $CC -B. -o $t/exe $t/a.so $t/b.o >& $t/log -grep -q 'undefined symbol: foo' $t/log +not $CC -B. -o $t/exe $t/a.so $t/b.o |& grep 'undefined symbol: foo' diff --git a/test/hidden-weak-undef.sh b/test/hidden-weak-undef.sh index 0a036f6c9d..d878c044c6 100755 --- a/test/hidden-weak-undef.sh +++ b/test/hidden-weak-undef.sh @@ -9,5 +9,5 @@ EOF $CC -B. 
-shared -o $t/b.so $t/a.o readelf -W --dyn-syms $t/b.so > $t/log -! grep -qw foo $t/log || false -grep -qw bar $t/log +not grep -w foo $t/log +grep -w bar $t/log diff --git a/test/icf-safe.sh b/test/icf-safe.sh index 2911826ffc..9dc6cfe1f6 100755 --- a/test/icf-safe.sh +++ b/test/icf-safe.sh @@ -5,7 +5,7 @@ # instead of R_390_PC32DBL if [ $MACHINE = s390x ]; then echo 'void *foo() { return foo; }' | $CC -c -o $t/a.o -xc - - readelf -r $t/a.o | grep -q R_390_PLT32DBL && skip + readelf -r $t/a.o | grep R_390_PLT32DBL && skip fi cat < $t/log2 -! grep foo2 $t/log2 || false - +$QEMU $t/exe2 | not grep foo2 diff --git a/test/icf.sh b/test/icf.sh index ea4391a269..23faf6197a 100755 --- a/test/icf.sh +++ b/test/icf.sh @@ -35,4 +35,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -Wl,-icf=all -$QEMU $t/exe | grep -q '1 0' +$QEMU $t/exe | grep '1 0' diff --git a/test/ifunc-address-equality-exported.sh b/test/ifunc-address-equality-exported.sh index e9de85c0f8..404c9b2f9d 100755 --- a/test/ifunc-address-equality-exported.sh +++ b/test/ifunc-address-equality-exported.sh @@ -3,7 +3,7 @@ # Clang miscompiles the test code, so skip it if Clang. # https://github.com/llvm/llvm-project/issues/111338 -$CC --version | grep -q clang && skip +$CC --version | grep clang && skip supports_ifunc || skip @@ -34,4 +34,4 @@ int main() { EOF $CC -B. -o $t/exe $t/c.o $t/b.so -no-pie -$QEMU $t/exe | grep -Eq '^(\S+) \1' +$QEMU $t/exe | grep -E '^(\S+) \1' diff --git a/test/ifunc-address-equality.sh b/test/ifunc-address-equality.sh index 2ba8fdd3cf..cc66a762bf 100755 --- a/test/ifunc-address-equality.sh +++ b/test/ifunc-address-equality.sh @@ -46,7 +46,6 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -no-pie -$QEMU $t/exe1 | grep -Eq '^(\S+) \1 (\S+) \2' +$QEMU $t/exe1 | grep -E '^(\S+) \1 (\S+) \2' -readelf --dynamic $t/exe1 > $t/log1 -! 
grep -q TEXTREL $t/log1 || false +readelf --dynamic $t/exe1 | not grep TEXTREL diff --git a/test/ifunc-alias.sh b/test/ifunc-alias.sh index ac6cdee7fa..6a296ea860 100755 --- a/test/ifunc-alias.sh +++ b/test/ifunc-alias.sh @@ -17,7 +17,7 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -pie -$QEMU $t/exe1 | grep -Eq '^(\S+) \1$' +$QEMU $t/exe1 | grep -E '^(\S+) \1$' $CC -B. -o $t/exe2 $t/a.o -no-pie -$QEMU $t/exe2 | grep -Eq '^(\S+) \1$' +$QEMU $t/exe2 | grep -E '^(\S+) \1$' diff --git a/test/ifunc-dlopen.sh b/test/ifunc-dlopen.sh index 2261d881c9..20cce9e191 100755 --- a/test/ifunc-dlopen.sh +++ b/test/ifunc-dlopen.sh @@ -39,4 +39,4 @@ EOF $CC -B. -o $t/c.so $t/b.o -shared $CC -B. -o $t/exe $t/a.o $t/c.so -no-pie -ldl -$QEMU $t/exe | grep -q 'foo foo' +$QEMU $t/exe | grep 'foo foo' diff --git a/test/ifunc-dso.sh b/test/ifunc-dso.sh index f231238717..a2da78a1ae 100755 --- a/test/ifunc-dso.sh +++ b/test/ifunc-dso.sh @@ -29,7 +29,7 @@ static Func *resolve_foobar(void) { EOF $CC -B. -o $t/c.so $t/b.o -shared -readelf -W --dyn-syms $t/c.so | grep -Eq '(IFUNC|: 10).*foobar' +readelf -W --dyn-syms $t/c.so | grep -E '(IFUNC|: 10).*foobar' $CC -B. -o $t/exe $t/a.o $t/c.so -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/ifunc-dynamic.sh b/test/ifunc-dynamic.sh index 7b5b341100..bd72beed9a 100755 --- a/test/ifunc-dynamic.sh +++ b/test/ifunc-dynamic.sh @@ -25,7 +25,7 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,lazy -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -Wl,-z,now -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' diff --git a/test/ifunc-export.sh b/test/ifunc-export.sh index 2944d8662a..76f4028214 100755 --- a/test/ifunc-export.sh +++ b/test/ifunc-export.sh @@ -21,4 +21,4 @@ Func *resolve_foobar(void) { EOF $CC -B. 
-shared -o $t/b.so $t/a.o -readelf --dyn-syms $t/b.so | grep -Eq '(IFUNC|: 10)\s+GLOBAL DEFAULT.* foobar' +readelf --dyn-syms $t/b.so | grep -E '(IFUNC|: 10)\s+GLOBAL DEFAULT.* foobar' diff --git a/test/ifunc-funcptr.sh b/test/ifunc-funcptr.sh index 641eed241e..777b22938a 100755 --- a/test/ifunc-funcptr.sh +++ b/test/ifunc-funcptr.sh @@ -34,7 +34,7 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -pie -$QEMU $t/exe1 | grep -q '^3$' +$QEMU $t/exe1 | grep '^3$' $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -no-pie -$QEMU $t/exe2 | grep -q '^3$' +$QEMU $t/exe2 | grep '^3$' diff --git a/test/ifunc-noplt.sh b/test/ifunc-noplt.sh index 8a55dbe792..2be2f3e5fe 100755 --- a/test/ifunc-noplt.sh +++ b/test/ifunc-noplt.sh @@ -25,7 +25,7 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -pie -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -no-pie -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' diff --git a/test/ifunc-static-pie.sh b/test/ifunc-static-pie.sh index 6fada8b286..4542f05307 100755 --- a/test/ifunc-static-pie.sh +++ b/test/ifunc-static-pie.sh @@ -24,4 +24,4 @@ int main() { EOF $CC -B. -o $t/exe2 $t/a.o -static-pie -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' diff --git a/test/ifunc-static.sh b/test/ifunc-static.sh index f17c79a003..89007b09af 100755 --- a/test/ifunc-static.sh +++ b/test/ifunc-static.sh @@ -24,4 +24,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -static -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/image-base.sh b/test/image-base.sh index 92c35f0f78..253c48d2d5 100755 --- a/test/image-base.sh +++ b/test/image-base.sh @@ -11,8 +11,8 @@ int main() { EOF $CC -B. 
-no-pie -o $t/exe1 $t/a.o -Wl,--image-base=0x8000000 -$QEMU $t/exe1 | grep -q 'Hello world' -readelf -W --sections $t/exe1 | grep -Eq '.interp\s+PROGBITS\s+0*8000...\b' +$QEMU $t/exe1 | grep 'Hello world' +readelf -W --sections $t/exe1 | grep -E '.interp\s+PROGBITS\s+0*8000...\b' cat < $t/log -! grep -Fq '(INIT)' $t/log || false +readelf --dynamic $t/exe | not grep -F '(INIT)' diff --git a/test/init.sh b/test/init.sh index dbeaffdf04..b769fe8a6b 100755 --- a/test/init.sh +++ b/test/init.sh @@ -7,8 +7,7 @@ int main() {} EOF $CC -B. -o $t/exe $t/a.o -Wl,-init,foo -readelf --dynamic $t/exe | grep -Fq '(INIT)' +readelf --dynamic $t/exe | grep -F '(INIT)' $CC -B. -o $t/exe $t/a.o -Wl,-init,no-such-symbol -readelf --dynamic $t/exe > $t/log -! grep -Fq '(INIT)' $t/log || false +readelf --dynamic $t/exe | not grep -F '(INIT)' diff --git a/test/initfirst.sh b/test/initfirst.sh index 53f3b76239..653d7bd0d5 100755 --- a/test/initfirst.sh +++ b/test/initfirst.sh @@ -10,4 +10,4 @@ void foo() { EOF $CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,initfirst -readelf --dynamic $t/b.so | grep -q 'Flags:.*INITFIRST' +readelf --dynamic $t/b.so | grep 'Flags:.*INITFIRST' diff --git a/test/interpose.sh b/test/interpose.sh index ba5a090769..de51ffe052 100755 --- a/test/interpose.sh +++ b/test/interpose.sh @@ -10,4 +10,4 @@ void foo() { EOF $CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,interpose -readelf --dynamic $t/b.so | grep -q 'Flags:.*INTERPOSE' +readelf --dynamic $t/b.so | grep 'Flags:.*INTERPOSE' diff --git a/test/invalid-version-script.sh b/test/invalid-version-script.sh index ed75b00059..0de6892f47 100755 --- a/test/invalid-version-script.sh +++ b/test/invalid-version-script.sh @@ -5,6 +5,5 @@ echo 'int main() {}' | $CC -c -o $t/a.o -xc - echo 'VER1 { foo[12; };' > $t/b.ver -! $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/b.ver \ - $t/a.o >& $t/log || false -grep -q 'invalid version pattern' $t/log +not $CC -B. 
-shared -o $t/c.so -Wl,-version-script,$t/b.ver $t/a.o |& + grep 'invalid version pattern' diff --git a/test/issue646.sh b/test/issue646.sh index a33f473e88..d9dac793af 100755 --- a/test/issue646.sh +++ b/test/issue646.sh @@ -1,6 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc +[ $MACHINE = sh4aeb ] && skip + cat < #include @@ -24,4 +26,4 @@ int main() { EOF $CXX -B. -o $t/exe $t/a.o -$QEMU $t/exe | grep -q 'error: exception' +$QEMU $t/exe | grep 'error: exception' diff --git a/test/large-alignment-dso.sh b/test/large-alignment-dso.sh index 16835effa5..f061ec38f4 100755 --- a/test/large-alignment-dso.sh +++ b/test/large-alignment-dso.sh @@ -32,4 +32,4 @@ int main() { greet(); } EOF $CC -B. -o $t/exe $t/c.o $t/b.so -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/large-alignment.sh b/test/large-alignment.sh index 04a48b0850..35d1d5e7d2 100755 --- a/test/large-alignment.sh +++ b/test/large-alignment.sh @@ -33,4 +33,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/large-max-page-size-strip.sh b/test/large-max-page-size-strip.sh index a4262be74b..071c45d9c0 100755 --- a/test/large-max-page-size-strip.sh +++ b/test/large-max-page-size-strip.sh @@ -16,4 +16,4 @@ EOF $CC -B. -o $t/exe $t/a.o -pie -Wl,-zmax-page-size=0x200000 $strip $t/exe -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/large-max-page-size.sh b/test/large-max-page-size.sh index a887ef732f..65cb0a7ca8 100755 --- a/test/large-max-page-size.sh +++ b/test/large-max-page-size.sh @@ -9,4 +9,4 @@ int main() { EOF $CC -B. 
-o $t/exe $t/a.o -pie -Wl,-zmax-page-size=0x200000 -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/large-text.sh b/test/large-text.sh index 221bcd21ff..bea858d406 100755 --- a/test/large-text.sh +++ b/test/large-text.sh @@ -13,4 +13,4 @@ int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/library.sh b/test/library.sh index 91d40bff55..907b17d144 100755 --- a/test/library.sh +++ b/test/library.sh @@ -16,7 +16,7 @@ int main() { hello(); } EOF $CC -B. -o $t/exe1 $t/c.o -L$t -Wl,--library,foobar -Wl,-rpath,$t -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/c.o -L$t -Wl,--library=foobar -Wl,-rpath,$t -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' diff --git a/test/link-order.sh b/test/link-order.sh index 614ffc1810..1ba1a5762b 100755 --- a/test/link-order.sh +++ b/test/link-order.sh @@ -16,7 +16,7 @@ int main() { EOF $CC -B. -o $t/exe $t/b.o -Wl,--as-needed $t/libfoo.so $t/libfoo.a -readelf --dynamic $t/exe | grep -q libfoo +readelf --dynamic $t/exe | grep libfoo $CC -B. -o $t/exe $t/b.o -Wl,--as-needed $t/libfoo.a $t/libfoo.so -! readelf --dynamic $t/exe | grep -q libfoo || false +readelf --dynamic $t/exe | not grep libfoo diff --git a/test/linker-script-defsym.sh b/test/linker-script-defsym.sh index c8c9d2e14d..a945c434bf 100755 --- a/test/linker-script-defsym.sh +++ b/test/linker-script-defsym.sh @@ -10,7 +10,7 @@ bar = foo; EOF $CC -B. -o $t/b.so -shared $t/script $t/a.o -readelf -sW $t/b.so | grep -q 'FUNC .* bar' +readelf -sW $t/b.so | grep 'FUNC .* bar' cat < @@ -22,4 +22,4 @@ int main() { EOF $CC -B. 
-o $t/exe $t/c.o $t/b.so -$QEMU $t/exe | grep -q 42 +$QEMU $t/exe | grep 42 diff --git a/test/linker-script-error.sh b/test/linker-script-error.sh index 75f9947333..d5267f97da 100755 --- a/test/linker-script-error.sh +++ b/test/linker-script-error.sh @@ -7,5 +7,4 @@ EOF echo 'VERSION { ver_x /*' > $t/b.script -! $CC -B. -o $t/exe $t/a.o $t/b.script 2> $t/log -grep -q 'unclosed comment' $t/log +not $CC -B. -o $t/exe $t/a.o $t/b.script |& grep 'unclosed comment' diff --git a/test/linker-script-relocatable.sh b/test/linker-script-relocatable.sh index 8830ba134b..acaf2bcdbe 100755 --- a/test/linker-script-relocatable.sh +++ b/test/linker-script-relocatable.sh @@ -2,7 +2,7 @@ . $(dirname $0)/common.inc # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip cat < @@ -19,4 +19,4 @@ echo "INPUT($t/a.o $t/b.o)" > $t/c.script ./mold --relocatable -o $t/d.o $t/c.script $CC -B. -o $t/exe $t/d.o -$QEMU $t/exe | grep -q Hello +$QEMU $t/exe | grep Hello diff --git a/test/linker-script.sh b/test/linker-script.sh index 8685500c4a..7b210e3ffc 100755 --- a/test/linker-script.sh +++ b/test/linker-script.sh @@ -13,10 +13,10 @@ GROUP("$t/a.o") EOF $CC -B. -o $t/exe $t/script -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' $CC -B. -o $t/exe -Wl,-T,$t/script -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' $CC -B. -o $t/exe -Wl,--script,$t/script -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/linker-script4.sh b/test/linker-script4.sh index ebc288c3e6..955e24db62 100755 --- a/test/linker-script4.sh +++ b/test/linker-script4.sh @@ -16,4 +16,4 @@ EOF $CC -B. 
-shared -o $t/c.so $t/a.script $t/b.s readelf --version-info $t/c.so > $t/log -grep -Fq 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log +grep -F 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log diff --git a/test/lto-archive.sh b/test/lto-archive.sh index 3938a9b2b9..a8844bf905 100755 --- a/test/lto-archive.sh +++ b/test/lto-archive.sh @@ -29,8 +29,8 @@ int main() { EOF $CC -B. -o $t/exe -flto $t/d.o $t/c.a -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' nm $t/exe > $t/log -grep -q hello $t/log -! grep -q howdy $t/log || false +grep hello $t/log +not grep howdy $t/log diff --git a/test/lto-dso.sh b/test/lto-dso.sh index 61c27794fd..84e0398132 100755 --- a/test/lto-dso.sh +++ b/test/lto-dso.sh @@ -11,7 +11,7 @@ $CC -B. -shared -o $t/b.so -flto $t/a.o if [ $MACHINE = ppc64 ]; then # On PPC64V1, function symbol refers a function descriptor in .opd - nm -D $t/b.so | grep -q 'D foo' + nm -D $t/b.so | grep 'D foo' else - nm -D $t/b.so | grep -q 'T foo' + nm -D $t/b.so | grep 'T foo' fi diff --git a/test/lto-gcc.sh b/test/lto-gcc.sh index 105045d1fc..c9d900f252 100755 --- a/test/lto-gcc.sh +++ b/test/lto-gcc.sh @@ -1,8 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -echo 'int main() {}' | $GCC -B. -flto -o /dev/null -xc - >& /dev/null \ - || skip +echo 'int main() {}' | $GCC -B. -flto -o /dev/null -xc - >& /dev/null || skip cat < @@ -12,7 +11,7 @@ int main() { EOF $GCC -B. -o $t/exe1 -flto $t/a.o -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' # Test that LTO is used for FAT LTO objects cat <&1 | grep -q -- -fwpa +$GCC -B. -o $t/exe2 $t/b.o --verbose |& grep -- -fwpa # Test FAT objects if -fno-use-linker-plugin is used @@ -34,4 +33,4 @@ int main() { EOF $GCC -B. 
-o $t/exe3 -flto -fno-use-linker-plugin $t/c.o -$QEMU $t/exe3 | grep -q 'Hello world' +$QEMU $t/exe3 | grep 'Hello world' diff --git a/test/lto-llvm.sh b/test/lto-llvm.sh index f32c661252..dd993b2ea5 100755 --- a/test/lto-llvm.sh +++ b/test/lto-llvm.sh @@ -3,8 +3,7 @@ [ $MACHINE = $(uname -m) ] || skip -echo 'int main() {}' | clang -B. -flto -o /dev/null -xc - >& /dev/null \ - || skip +echo 'int main() {}' | clang -B. -flto -o /dev/null -xc - >& /dev/null || skip cat < @@ -14,4 +13,4 @@ int main() { EOF clang -B. -o $t/exe -flto $t/a.o -$t/exe | grep -q 'Hello world' +$t/exe | grep 'Hello world' diff --git a/test/lto-no-plugin.sh b/test/lto-no-plugin.sh new file mode 100755 index 0000000000..2d770f26e3 --- /dev/null +++ b/test/lto-no-plugin.sh @@ -0,0 +1,12 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +[ $MACHINE = $(uname -m) ] || skip + +echo 'int main() {}' | clang -B. -flto -o /dev/null -xc - >& /dev/null || skip + +echo 'int main() {}' | clang -c -o $t/a.o -xc - +echo 'void foo() {}' | clang -c -o $t/b.o -xc - -flto + +not ./mold -o /dev/null $t/a.o $t/b.o |& + grep "b.o: don't know how to handle this LTO object file" diff --git a/test/lto-nostdlib.sh b/test/lto-nostdlib.sh index dc0eaf66e0..f84f918a5c 100755 --- a/test/lto-nostdlib.sh +++ b/test/lto-nostdlib.sh @@ -8,4 +8,4 @@ void _start() {} EOF $CC -B. -o $t/exe -flto $t/a.o -nostdlib -readelf -Ws $t/exe | grep -Eq ' _start' +readelf -Ws $t/exe | grep -E ' _start' diff --git a/test/lto-version-script.sh b/test/lto-version-script.sh index a6fce5fd49..b483b41339 100755 --- a/test/lto-version-script.sh +++ b/test/lto-version-script.sh @@ -19,9 +19,9 @@ $CC -B. -shared -o $t/c.so -flto $t/a.o -Wl,-version-script=$t/b.script if [ $MACHINE = ppc64 ]; then # On PPC64V1, function symbol refers a function descriptor in .opd - nm -D $t/c.so | grep -q 'D foo' - ! nm -D $t/c.so | grep -q 'D bar' || false + nm -D $t/c.so | grep 'D foo' + nm -D $t/c.so | not grep 'D bar' else - nm -D $t/c.so | grep -q 'T foo' - ! 
nm -D $t/c.so | grep -q 'T bar' || false + nm -D $t/c.so | grep 'T foo' + nm -D $t/c.so | not grep 'T bar' fi diff --git a/test/main-in-dso.sh b/test/main-in-dso.sh index 67c062bde7..03a68e6774 100755 --- a/test/main-in-dso.sh +++ b/test/main-in-dso.sh @@ -1,6 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc +[ $MACHINE = ppc64 ] && skip + cat < int main() { @@ -12,8 +14,8 @@ $CC -B. -shared -o $t/b.so $t/a.o $CC -o $t/c.o -c -xc /dev/null -fPIC $CC -B. -o $t/exe1 $t/c.o $t/b.so -pie -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -o $t/c.o -c -xc /dev/null -fno-PIC $CC -B. -o $t/exe2 $t/c.o $t/b.so -no-pie -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' diff --git a/test/many-sections.sh b/test/many-sections.sh index b0ee2bbaaa..bfd3aeb14a 100755 --- a/test/many-sections.sh +++ b/test/many-sections.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -seq 1 100000 | sed 's/.*/.section .data.\0,"aw"\n.word 0\n/g' | \ +seq 1 100000 | sed 's/.*/.section .data.\0,"aw"\n.word 0\n/g' | $CC -c -xassembler -o $t/a.o - cat <<'EOF' | $CC -c -xc -o $t/b.o - @@ -14,4 +14,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q Hello +$QEMU $t/exe | grep Hello diff --git a/test/many-sections2.sh b/test/many-sections2.sh index 7d37be197f..2d4da81500 100755 --- a/test/many-sections2.sh +++ b/test/many-sections2.sh @@ -2,7 +2,7 @@ . $(dirname $0)/common.inc # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip echo 'foo = 0x1000' > $t/a.s seq 1 100000 | sed 's/.*/.section .data.&,"aw"\n.globl x&\nx&: .word 0\n/g' >> $t/a.s @@ -11,9 +11,9 @@ $CC -c -xassembler -o $t/a.o $t/a.s ./mold --relocatable -o $t/b.o $t/a.o readelf -WS $t/b.o > $t/log1 -grep -Fq .data.100000 $t/log1 +grep -F .data.100000 $t/log1 readelf -Ws $t/b.o > $t/log2 -grep -Fq 'GLOBAL DEFAULT 100000' $t/log2 -grep -Fq 'ABS foo' $t/log2 -! 
grep -Fq 'ABS x68966' $t/log2 || false +grep -F 'GLOBAL DEFAULT 100000' $t/log2 +grep -F 'ABS foo' $t/log2 +not grep -F 'ABS x68966' $t/log2 diff --git a/test/mcmodel-large.sh b/test/mcmodel-large.sh new file mode 100755 index 0000000000..7697aa5570 --- /dev/null +++ b/test/mcmodel-large.sh @@ -0,0 +1,14 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +test_cflags -static -mcmodel=large || skip + +cat < +int main() { + printf("Hello world\n"); +} +EOF + +$CC -B. -o $t/exe $t/a.o -static +$QEMU $t/exe | grep 'Hello world' diff --git a/test/mergeable-strings.sh b/test/mergeable-strings.sh index d46c3a9b9a..307a95189d 100755 --- a/test/mergeable-strings.sh +++ b/test/mergeable-strings.sh @@ -36,7 +36,7 @@ EOF # linker first to verify that it does work on this system. $CC -o $t/exe1 $t/a.o $t/b.o -no-pie -if $QEMU $t/exe1 | grep -Eq '^(\S+) \1 (\S+) \2 (\S+) \3 (\S+) \4$'; then +if $QEMU $t/exe1 | grep -E '^(\S+) \1 (\S+) \2 (\S+) \3 (\S+) \4$'; then $CC -B. -o $t/exe2 $t/a.o $t/b.o -no-pie - $QEMU $t/exe2 | grep -Eq '^(\S+) \1 (\S+) \2 (\S+) \3 (\S+) \4$' + $QEMU $t/exe2 | grep -E '^(\S+) \1 (\S+) \2 (\S+) \3 (\S+) \4$' fi diff --git a/test/missing-error.sh b/test/missing-error.sh index 265f9f8d5d..e005b3c819 100755 --- a/test/missing-error.sh +++ b/test/missing-error.sh @@ -9,6 +9,7 @@ int main() { } EOF -! 
./mold -o $t/exe $t/a.o 2> $t/log || false -grep -q 'undefined symbol: foo' $t/log -grep -q '>>> .*a\.o' $t/log +not ./mold -o $t/exe $t/a.o 2> $t/log + +grep 'undefined symbol: foo' $t/log +grep '>>> .*a\.o' $t/log diff --git a/test/mold-wrapper.sh b/test/mold-wrapper.sh index 4748c8d74c..9a4e37e579 100755 --- a/test/mold-wrapper.sh +++ b/test/mold-wrapper.sh @@ -3,9 +3,9 @@ [ "$CC" = cc ] || skip -ldd mold-wrapper.so | grep -q libasan && skip +ldd mold-wrapper.so | grep libasan && skip -nm mold | grep -q '__[at]san_init' && skip +nm mold | grep '__[at]san_init' && skip cat <<'EOF' > $t/a.sh #!/usr/bin/env bash @@ -67,9 +67,9 @@ int main(int argc, char **argv) { } EOF -LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execl | grep -q 'a.sh execl' -LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execlp | grep -q 'a.sh execlp' -LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execle | grep -q 'a.sh execle' -LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execv | grep -q 'a.sh execv' -LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execvp | grep -q 'a.sh execvp' -LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execvpe | grep -q 'a.sh execvpe bar' +LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execl | grep 'a.sh execl' +LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execlp | grep 'a.sh execlp' +LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execle | grep 'a.sh execle' +LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execv | grep 'a.sh execv' +LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execvp | grep 'a.sh execvp' +LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execvpe | grep 'a.sh execvpe bar' diff --git a/test/mold-wrapper2.sh b/test/mold-wrapper2.sh index efeaf05b92..2e4605bd6d 100755 --- a/test/mold-wrapper2.sh +++ b/test/mold-wrapper2.sh @@ -1,7 +1,7 @@ #!/bin/bash . 
$(dirname $0)/common.inc -ldd mold-wrapper.so | grep -q libasan && skip -nm mold | grep -q '__[at]san_init' && skip +ldd mold-wrapper.so | grep libasan && skip +nm mold | grep '__[at]san_init' && skip -./mold -run bash -c 'echo $LD_PRELOAD' | grep -Fq mold-wrapper.so +./mold -run bash -c 'echo $LD_PRELOAD' | grep -F mold-wrapper.so diff --git a/test/no-allow-shlib-undefined.sh b/test/no-allow-shlib-undefined.sh index 846e046a13..a6397e8210 100755 --- a/test/no-allow-shlib-undefined.sh +++ b/test/no-allow-shlib-undefined.sh @@ -17,5 +17,5 @@ EOF $CC -B. -o $t/exe1 $t/a.o -Wl,--no-allow-shlib-undefined -L$t -lfoo -lbar -! $CC -B. -o $t/exe2 $t/a.o -Wl,--no-allow-shlib-undefined -L$t -lbar >& $t/log || false -grep -Fq 'undefined symbol: foo' $t/log +not $CC -B. -o $t/exe2 $t/a.o -Wl,--no-allow-shlib-undefined -L$t -lbar >& $t/log +grep -F 'undefined symbol: foo' $t/log diff --git a/test/no-eh-frame-header.sh b/test/no-eh-frame-header.sh index 09a8ae47fc..9da886c3ba 100755 --- a/test/no-eh-frame-header.sh +++ b/test/no-eh-frame-header.sh @@ -8,8 +8,5 @@ int main() { EOF $CC -B. -Wl,--no-eh-frame-hdr -Wl,--thread-count=1 -O0 -o $t/exe $t/a.o - -readelf -WS $t/exe > $t/log -! grep -F .eh_frame_hdr $t/log || false - +readelf -WS $t/exe | not grep -F .eh_frame_hdr $QEMU $t/exe diff --git a/test/no-quick-exit.sh b/test/no-quick-exit.sh index 8b9ef1b096..607fb081fd 100755 --- a/test/no-quick-exit.sh +++ b/test/no-quick-exit.sh @@ -11,4 +11,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -Wl,-no-quick-exit -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/no-undefined-version.sh b/test/no-undefined-version.sh index c2b4045fc4..a761eab03d 100755 --- a/test/no-undefined-version.sh +++ b/test/no-undefined-version.sh @@ -7,5 +7,5 @@ cat < $t/log -grep -Fq 'a.ver: cannot assign version `ver_x` to symbol `foo`: symbol not found' $t/log +$CC -B. 
-o $t/exe -Wl,--version-script,$t/a.ver $t/b.o |& + grep -F 'a.ver: cannot assign version `ver_x` to symbol `foo`: symbol not found' diff --git a/test/nocopyreloc.sh b/test/nocopyreloc.sh index bcfa044f13..1f7b77befb 100755 --- a/test/nocopyreloc.sh +++ b/test/nocopyreloc.sh @@ -27,8 +27,7 @@ int main() { EOF $CC -B. -no-pie -o $t/exe $t/a.so $t/b.o -$QEMU $t/exe | grep -q '3 5' +$QEMU $t/exe | grep '3 5' -! $CC -B. -o $t/exe $t/a.so $t/b.o -no-pie -Wl,-z,nocopyreloc 2> $t/log || false - -grep -q 'recompile with -fPIC' $t/log +not $CC -B. -o $t/exe $t/a.so $t/b.o -no-pie -Wl,-z,nocopyreloc |& + grep 'recompile with -fPIC' diff --git a/test/noinhibit-exec.sh b/test/noinhibit-exec.sh index 766db37abc..94608c3eda 100755 --- a/test/noinhibit-exec.sh +++ b/test/noinhibit-exec.sh @@ -7,8 +7,8 @@ EOF $CC -B. -shared -o $t/b.so $t/a.o -! $CC -B. -o $t/b.so $t/a.o -Wl,-require-defined=no-such-sym >& $t/log1 || false -grep -q 'undefined symbol: no-such-sym' $t/log1 +not $CC -B. -o $t/b.so $t/a.o -Wl,-require-defined=no-such-sym |& + grep 'undefined symbol: no-such-sym' -$CC -B. -shared -o $t/b.o $t/a.o -Wl,-require-defined=no-such-sym -Wl,-noinhibit-exec >& $t/log2 -grep -q 'undefined symbol: no-such-sym' $t/log2 +$CC -B. -shared -o $t/b.o $t/a.o -Wl,-require-defined=no-such-sym,-noinhibit-exec |& + grep 'undefined symbol: no-such-sym' diff --git a/test/non-canonical-plt.sh b/test/non-canonical-plt.sh index f8f39d7a4f..1f32ebee7e 100755 --- a/test/non-canonical-plt.sh +++ b/test/non-canonical-plt.sh @@ -32,7 +32,7 @@ int main() { EOF $CC -B. 
-no-pie -o $t/exe $t/a.so $t/b.o $t/c.o -$QEMU $t/exe | grep -q '^1 1 1$' +$QEMU $t/exe | grep '^1 1 1$' -readelf --dyn-syms $t/exe | grep -q '00000000 .* foo' -readelf --dyn-syms $t/exe | grep -q '00000000 .* bar' +readelf --dyn-syms $t/exe | grep '00000000 .* foo' +readelf --dyn-syms $t/exe | grep '00000000 .* bar' diff --git a/test/nostdlib.sh b/test/nostdlib.sh index b97e3466cf..24ec92f88e 100755 --- a/test/nostdlib.sh +++ b/test/nostdlib.sh @@ -8,5 +8,5 @@ EOF ./mold -o $t/exe $t/a.o readelf -W --sections $t/exe > $t/log -! grep -Fq ' .dynsym ' $t/log || false -! grep -Fq ' .dynstr ' $t/log || false +not grep -F ' .dynsym ' $t/log +not grep -F ' .dynstr ' $t/log diff --git a/test/oformat-binary.sh b/test/oformat-binary.sh index ae8831e529..5c64b8b47c 100755 --- a/test/oformat-binary.sh +++ b/test/oformat-binary.sh @@ -6,9 +6,9 @@ void _start() {} EOF ./mold -o $t/exe $t/a.o --oformat=binary -Ttext=0x4000 -Map=$t/map -grep -Eq '^\s+0x4000\s+[0-9]+\s+[0-9]+\s+\.text$' $t/map +grep -E '^\s+0x4000\s+[0-9]+\s+[0-9]+\s+\.text$' $t/map -! grep -Fq .strtab $t/map || false -! grep -Fq .shstrtab $t/map || false -! grep -Fq .symtab $t/map || false -! grep -Fq .comment $t/map || false +not grep -F .strtab $t/map +not grep -F .shstrtab $t/map +not grep -F .symtab $t/map +not grep -F .comment $t/map diff --git a/test/omagic.sh b/test/omagic.sh index 1749574eab..69935ee7f6 100755 --- a/test/omagic.sh +++ b/test/omagic.sh @@ -13,4 +13,4 @@ int main() { EOF $CC -B. $t/a.o -o $t/exe -static -Wl,--omagic -readelf -W --segments $t/exe | grep -qw RWE +readelf -W --segments $t/exe | grep -w RWE diff --git a/test/package-metadata.sh b/test/package-metadata.sh index f766b9f51b..4b2d987d3c 100755 --- a/test/package-metadata.sh +++ b/test/package-metadata.sh @@ -9,10 +9,10 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-package-metadata='{"foo":"bar"}' -readelf -x .note.package $t/exe1 | grep -Fq '{"foo":"bar"}' +readelf -x .note.package $t/exe1 | grep -F '{"foo":"bar"}' -$CC -B. 
-o $t/exe2 $t/a.o -Wl,--encoded-package-metadata=%7B%22foo%22%3A%22bar%22%7D -readelf -x .note.package $t/exe2 | grep -Fq '{"foo":"bar"}' +$CC -B. -o $t/exe2 $t/a.o -Wl,--package-metadata='%7B%22foo%22%3A%22bar%22%7D' +readelf -x .note.package $t/exe2 | grep -F '{"foo":"bar"}' -! $CC -B. -o $t/exe3 $t/a.o -Wl,--encoded-package-metadata=foo%x >& $t/log -grep -q 'invalid string: foo%x' $t/log +not $CC -B. -o $t/exe3 $t/a.o -Wl,--package-metadata='foo%x' |& + grep 'invalid string: foo%x' diff --git a/test/physical-image-base.sh b/test/physical-image-base.sh index 63ac3e1572..bfbe46177a 100755 --- a/test/physical-image-base.sh +++ b/test/physical-image-base.sh @@ -3,8 +3,8 @@ [ $MACHINE = ppc64 ] && skip -# BusyBox's grep can't handle capture groups (e.g. \1, \2 ...) -grep --version 2>&1 | grep BusyBox && skip +# Test if grep supports backreferences +echo abab | grep -E '(ab)\1' || skip cat < @@ -20,14 +20,14 @@ EOF $CC -B. -no-pie -o $t/exe1 $t/a.o -Wl,--image-base=0x200000 \ -Wl,--physical-image-base=0x800000 -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' -readelf -W --segments $t/exe1 | grep -Eq 'LOAD\s+0x000000 0x0*200000 0x0*800000' -readelf -Ws $t/exe1 | grep -q __phys_start_foo +readelf -W --segments $t/exe1 | grep -E 'LOAD\s+0x000000 0x0*200000 0x0*800000' +readelf -Ws $t/exe1 | grep __phys_start_foo $CC -B. -no-pie -o $t/exe2 $t/a.o -Wl,--physical-image-base=0x800000 \ -Wl,--section-order='=0x800000 TEXT RODATA =0x900000 DATA BSS' -readelf -W --segments $t/exe2 | grep -Eq 'LOAD\s+\S+\s+(\S+)\s\1.*R E 0' -readelf -W --segments $t/exe2 | grep -Eq 'LOAD\s+\S+\s+(\S+)\s\1.*R 0' +readelf -W --segments $t/exe2 | grep -E 'LOAD\s+\S+\s+(\S+)\s\1.*R E 0' +readelf -W --segments $t/exe2 | grep -E 'LOAD\s+\S+\s+(\S+)\s\1.*R 0' diff --git a/test/pie.sh b/test/pie.sh index 3165bd08c3..c7f057620e 100755 --- a/test/pie.sh +++ b/test/pie.sh @@ -11,5 +11,5 @@ int main() { EOF $CC -B. 
-pie -o $t/exe $t/a.o -readelf --file-header $t/exe | grep -q -E '(Shared object file|Position-Independent Executable file)' -$QEMU $t/exe | grep -q 'Hello world' +readelf --file-header $t/exe | grep -E '(Shared object file|Position-Independent Executable file)' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/plt-dso.sh b/test/plt-dso.sh index c58294e9a0..9ff208a66c 100755 --- a/test/plt-dso.sh +++ b/test/plt-dso.sh @@ -35,4 +35,4 @@ int main() { EOF $CC -B. -o $t/exe -Wl,-rpath=$t $t/c.o $t/b.so -$QEMU $t/exe | grep -q 'Hello WORLD' +$QEMU $t/exe | grep 'Hello WORLD' diff --git a/test/plt-symbols.sh b/test/plt-symbols.sh new file mode 100755 index 0000000000..fd372f7545 --- /dev/null +++ b/test/plt-symbols.sh @@ -0,0 +1,10 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < $t/log1 -! grep -q PREINIT_ARRAY $t/log1 || false +readelf -W --dynamic $t/exe1 | not grep PREINIT_ARRAY cat < $t/log 2> /dev/null -grep -q 'b\.o.*a\.o.*foo$' $t/log +$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,--print-dependencies | grep 'b\.o.*a\.o.*foo$' diff --git a/test/protected-dynsym.sh b/test/protected-dynsym.sh index ca2da27c69..c245f4a7e3 100755 --- a/test/protected-dynsym.sh +++ b/test/protected-dynsym.sh @@ -10,11 +10,11 @@ __attribute__((visibility("protected"))) int foo; EOF $CC -B. -shared -o $t/c.so $t/a.o $t/b.o -Wl,-strip-all -readelf --symbols $t/c.so | grep -Eq 'PROTECTED\b.*\bfoo\b' +readelf --symbols $t/c.so | grep -E 'PROTECTED\b.*\bfoo\b' cat < /dev/null | grep -q '3 4 0' +$QEMU $t/exe 2> /dev/null | grep '3 4 0' diff --git a/test/push-pop-state.sh b/test/push-pop-state.sh index 496893c906..a77678a652 100755 --- a/test/push-pop-state.sh +++ b/test/push-pop-state.sh @@ -17,5 +17,5 @@ $CC -B. -o $t/exe $t/c.o -Wl,-as-needed \ -Wl,-push-state -Wl,-no-as-needed $t/a.so -Wl,-pop-state $t/b.so readelf --dynamic $t/exe > $t/log -grep -Fq a.so $t/log -! 
grep -Fq b.so $t/log || false +grep -F a.so $t/log +not grep -F b.so $t/log diff --git a/test/range-extension-thunk.sh b/test/range-extension-thunk.sh index 065287dd69..7fd00816aa 100755 --- a/test/range-extension-thunk.sh +++ b/test/range-extension-thunk.sh @@ -49,11 +49,11 @@ $CC -c -o $t/d.o $t/b.c -O0 $CC -B. -o $t/exe1 $t/c.o $t/d.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$QEMU $t/exe1 | grep -q 'main fn1 fn3 fn2 fn4' +$QEMU $t/exe1 | grep 'main fn1 fn3 fn2 fn4' $CC -c -o $t/e.o $t/a.c -O2 $CC -c -o $t/f.o $t/b.c -O2 $CC -B. -o $t/exe2 $t/e.o $t/f.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 -$QEMU $t/exe2 | grep -q 'main fn1 fn3 fn2 fn4' +$QEMU $t/exe2 | grep 'main fn1 fn3 fn2 fn4' diff --git a/test/range-extension-thunk2.sh b/test/range-extension-thunk2.sh index 758f6afcc8..4eb585f8a1 100755 --- a/test/range-extension-thunk2.sh +++ b/test/range-extension-thunk2.sh @@ -4,28 +4,28 @@ cat < -void f0(); -void f1(); -void f2(); -void f3(); -void f4(); -void f5(); -void f6(); -void f7(); -void f8(); -void f9(); -void f10(); -void f11(); -void f12(); -void f13(); -void f14(); -void f15(); -void f16(); -void f17(); -void f18(); -void f19(); - -void f0(int x) { printf("0 "); if (!x) f9(); } +void f0(int); +void f1(int); +void f2(int); +void f3(int); +void f4(int); +void f5(int); +void f6(int); +void f7(int); +void f8(int); +void f9(int); +void f10(int); +void f11(int); +void f12(int); +void f13(int); +void f14(int); +void f15(int); +void f16(int); +void f17(int); +void f18(int); +void f19(int); + +void f0(int x) { printf("0 "); if (!x) f9(x); } void space0() { __asm__(".space 1024*1024"); } void f1(int x) { printf("1 "); f8(x); } @@ -92,4 +92,4 @@ int main() { EOF $CC -B. 
-o $t/exe $t/a.o -$QEMU $t/exe | grep -Eq '^0 9 1 8 2 7 3 6 4 5 10 19 11 18 12 17 13 16 14 15 0 $' +$QEMU $t/exe | grep -E '^0 9 1 8 2 7 3 6 4 5 10 19 11 18 12 17 13 16 14 15 0 $' diff --git a/test/relax-got-load.sh b/test/relax-got-load.sh index 27c125062d..a75684a59b 100755 --- a/test/relax-got-load.sh +++ b/test/relax-got-load.sh @@ -13,5 +13,17 @@ void hello(); int main() { hello(); } EOF -$CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q 'Hello world' +$CC -B. -o $t/exe1 $t/a.o $t/b.o +$QEMU $t/exe1 | grep 'Hello world' + +$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--no-relax +$QEMU $t/exe2 | grep 'Hello world' + +# On x86, GOTPCRELX is relaxed even with --no-relax +case $MACHINE in +aarch64 | riscv64 | s390x | loongarch64) + $OBJDUMP -d $t/exe1 | grep -v exe1 > $t/log1 + $OBJDUMP -d $t/exe2 | grep -v exe2 > $t/log2 + not diff $t/log1 $t/log2 > /dev/null + ;; +esac diff --git a/test/reloc-rodata.sh b/test/reloc-rodata.sh index 5f73fecd4f..53d5d0ff85 100755 --- a/test/reloc-rodata.sh +++ b/test/reloc-rodata.sh @@ -15,5 +15,5 @@ int main() { } EOF -! $CC -B. -o $t/exe $t/a.o -pie >& $t/log -grep -Eq 'relocation against symbol .+ can not be used; recompile with -fPIC' $t/log +not $CC -B. -o $t/exe $t/a.o -pie |& + grep -E 'relocation against symbol .+ can not be used; recompile with -fPIC' diff --git a/test/relocatable-archive.sh b/test/relocatable-archive.sh index b6e0166a2f..9042d011a5 100755 --- a/test/relocatable-archive.sh +++ b/test/relocatable-archive.sh @@ -2,7 +2,7 @@ . $(dirname $0)/common.inc # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip cat < $t/log -grep -q 'foo\b' $t/log -grep -q 'bar\b' $t/log -! grep -q 'baz\b' $t/log || false +grep 'foo\b' $t/log +grep 'bar\b' $t/log +not grep 'baz\b' $t/log diff --git a/test/relocatable-c++.sh b/test/relocatable-c++.sh index 6ce5e7be3e..5487deb3f9 100755 --- a/test/relocatable-c++.sh +++ b/test/relocatable-c++.sh @@ -2,7 +2,7 @@ . 
$(dirname $0)/common.inc # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip cat <& /dev/null || skip +test_cflags -g3 -gz || skip cat < @@ -18,4 +18,4 @@ EOF ./mold --relocatable -o $t/c.o $t/a.o $t/b.o $CC -B. -o $t/exe $t/c.o -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/relocatable-debug-info.sh b/test/relocatable-debug-info.sh index 8cb3326cad..18cf03237d 100755 --- a/test/relocatable-debug-info.sh +++ b/test/relocatable-debug-info.sh @@ -2,7 +2,7 @@ . $(dirname $0)/common.inc # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip cat < @@ -16,12 +16,11 @@ EOF # It looks like objdump prints out a warning message for # object files compiled with Clang. -$OBJDUMP --dwarf=info $t/a.o $t/b.o 2>&1 | grep -q 'Warning: DIE at offset' && skip +$OBJDUMP --dwarf=info $t/a.o $t/b.o |& grep 'Warning: DIE at offset' && skip ./mold --relocatable -o $t/c.o $t/a.o $t/b.o $CC -B. -o $t/exe $t/c.o -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' -$OBJDUMP --dwarf=info $t/c.o > /dev/null 2> $t/log -! grep -q Warning $t/log || false +$OBJDUMP --dwarf=info $t/c.o > /dev/null |& not grep Warning diff --git a/test/relocatable-exception.sh b/test/relocatable-exception.sh index 52411c7e54..ac4dd40965 100755 --- a/test/relocatable-exception.sh +++ b/test/relocatable-exception.sh @@ -3,9 +3,10 @@ [ $MACHINE = m68k ] && skip [ $MACHINE = sh4 ] && skip +[ $MACHINE = sh4aeb ] && skip # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip cat < $t/log1 -grep -Fq .text.foo $t/log1 -grep -Fq .text.bar $t/log1 +grep -F .text.foo $t/log1 +grep -F .text.bar $t/log1 ./mold --relocatable -o $t/c.o $t/a.o --relocatable-merge-sections readelf -WS $t/c.o > $t/log2 -! grep -Fq .text.foo $t/log2 || false -! 
grep -Fq .text.bar $t/log2 || false +not grep -F .text.foo $t/log2 +not grep -F .text.bar $t/log2 diff --git a/test/relocatable-mergeable-sections.sh b/test/relocatable-mergeable-sections.sh index c393613ee0..e63e134a95 100755 --- a/test/relocatable-mergeable-sections.sh +++ b/test/relocatable-mergeable-sections.sh @@ -2,7 +2,7 @@ . $(dirname $0)/common.inc # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip cat < diff --git a/test/relro.sh b/test/relro.sh index 722f363538..9ba49586e0 100755 --- a/test/relro.sh +++ b/test/relro.sh @@ -9,16 +9,13 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,relro,-z,lazy -$QEMU $t/exe1 | grep -q 'Hello world' -readelf --segments -W $t/exe1 > $t/log1 -grep -q 'GNU_RELRO ' $t/log1 +$QEMU $t/exe1 | grep 'Hello world' +readelf --segments -W $t/exe1 | grep -w GNU_RELRO $CC -B. -o $t/exe2 $t/a.o -Wl,-z,relro,-z,now -$QEMU $t/exe2 | grep -q 'Hello world' -readelf --segments -W $t/exe2 > $t/log2 -grep -q 'GNU_RELRO ' $t/log2 +$QEMU $t/exe2 | grep 'Hello world' +readelf --segments -W $t/exe2 | grep -w GNU_RELRO $CC -B. -o $t/exe3 $t/a.o -Wl,-z,norelro -$QEMU $t/exe3 | grep -q 'Hello world' -readelf --segments -W $t/exe3 > $t/log3 -! grep -q 'GNU_RELRO ' $t/log3 || false +$QEMU $t/exe3 | grep 'Hello world' +readelf --segments -W $t/exe3 | not grep -w GNU_RELRO diff --git a/test/repro.sh b/test/repro.sh index f8b84a71d6..25b20dafd5 100755 --- a/test/repro.sh +++ b/test/repro.sh @@ -13,19 +13,19 @@ EOF rm -rf $t/exe.repro $t/exe.repro.tar $CC -B. -o $t/exe $t/a.o -! [ -f $t/exe.repro.tar ] || false +not [ -f $t/exe.repro.tar ] $CC -B. 
-o $t/exe $t/a.o -Wl,-repro tar -C $t -xf $t/exe.repro.tar -tar -C $t -tvf $t/exe.repro.tar | grep -q ' exe.repro/.*/a.o' -grep -q /a.o $t/exe.repro/response.txt -grep -q mold $t/exe.repro/version.txt +tar -C $t -tvf $t/exe.repro.tar | grep ' exe.repro/.*/a.o' +grep /a.o $t/exe.repro/response.txt +grep mold $t/exe.repro/version.txt rm -rf $t/exe.repro $t/exe.repro.tar MOLD_REPRO=1 $CC -B. -o $t/exe $t/a.o -tar -C $t -tvf $t/exe.repro.tar | grep -q ' exe.repro/.*/a.o' +tar -C $t -tvf $t/exe.repro.tar | grep ' exe.repro/.*/a.o' tar -C $t -xf $t/exe.repro.tar -grep -q /a.o $t/exe.repro/response.txt -grep -q mold $t/exe.repro/version.txt +grep /a.o $t/exe.repro/response.txt +grep mold $t/exe.repro/version.txt diff --git a/test/require-defined.sh b/test/require-defined.sh index 5fedee6dd1..642d860740 100755 --- a/test/require-defined.sh +++ b/test/require-defined.sh @@ -13,10 +13,10 @@ int main() {} EOF $CC -B. -o $t/exe $t/c.o $t/b.a -! readelf --symbols $t/exe | grep -q foobar || false +readelf --symbols $t/exe | not grep foobar $CC -B. -o $t/exe $t/c.o $t/b.a -Wl,-require-defined,foobar -readelf --symbols $t/exe | grep -q foobar +readelf --symbols $t/exe | grep foobar -! $CC -B. -o $t/exe $t/c.o $t/b.a -Wl,-require-defined,xyz >& $t/log -grep -q 'undefined symbol: xyz' $t/log +not $CC -B. -o $t/exe $t/c.o $t/b.a -Wl,-require-defined,xyz |& + grep 'undefined symbol: xyz' diff --git a/test/retain-symbols-file.sh b/test/retain-symbols-file.sh index dba11d3c86..43e6a1e1dd 100755 --- a/test/retain-symbols-file.sh +++ b/test/retain-symbols-file.sh @@ -16,8 +16,8 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,--retain-symbols-file=$t/symbols readelf -W --symbols $t/exe > $t/log -! grep -q ' foo$' $t/log || false -! grep -q ' bar$' $t/log || false -! 
grep -q ' main$' $t/log || false +not grep ' foo$' $t/log +not grep ' bar$' $t/log +not grep ' main$' $t/log -grep -q ' baz$' $t/log +grep ' baz$' $t/log diff --git a/test/reverse-sections.sh b/test/reverse-sections.sh index d834259669..bd9d81d5cc 100755 --- a/test/reverse-sections.sh +++ b/test/reverse-sections.sh @@ -37,7 +37,7 @@ int main() { printf("\n"); } EOF $CXX -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -$QEMU $t/exe1 | grep -q 'foo1 foo2 foo3 foo4 foo5 foo6' +$QEMU $t/exe1 | grep 'foo1 foo2 foo3 foo4 foo5 foo6' $CXX -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,--reverse-sections -$QEMU $t/exe2 | grep -q 'foo5 foo6 foo3 foo4 foo1 foo2' +$QEMU $t/exe2 | grep 'foo5 foo6 foo3 foo4 foo1 foo2' diff --git a/test/rodata-name.sh b/test/rodata-name.sh index def5a6fdc2..679d95df3d 100755 --- a/test/rodata-name.sh +++ b/test/rodata-name.sh @@ -5,6 +5,10 @@ # Concretely speaking, ARM as uses "@" as a start of a comment. [ $MACHINE = arm ] && skip +# All data symbols need to be aligned to 2 byte boundaries on s390x, +# so rodata.str1.1 in this file is invalid on s390x. +[ $MACHINE = s390x ] && skip + cat <<'EOF' | $CC -c -o $t/a.o -x assembler - .globl val1, val2, val3, val4, val5 @@ -43,8 +47,8 @@ EOF $CC -B. -o $t/exe $t/a.o $t/b.o -readelf -p .rodata.str1.1 $t/exe | grep -q Hello -readelf -p .rodata.str4.4 $t/exe | grep -q world -readelf -p .rodata.str1.1 $t/exe | grep -q foobar -readelf -p .rodata.cst8 $t/exe | grep -q abcdefgh -readelf -p .rodatabaz $t/exe | grep -q baz +readelf -p .rodata.str1.1 $t/exe | grep Hello +readelf -p .rodata.str4.4 $t/exe | grep world +readelf -p .rodata.str1.1 $t/exe | grep foobar +readelf -p .rodata.cst8 $t/exe | grep abcdefgh +readelf -p .rodatabaz $t/exe | grep baz diff --git a/test/rosegment.sh b/test/rosegment.sh index f9a8e3a6a4..9a5a825dcb 100755 --- a/test/rosegment.sh +++ b/test/rosegment.sh @@ -7,13 +7,10 @@ int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -readelf -W --segments $t/exe1 > $t/log1 -! 
grep -q '\.interp .* \.text' $t/log1 || false +readelf -W --segments $t/exe1 | not grep '\.interp .* \.text' $CC -B. -o $t/exe2 $t/a.o -Wl,--rosegment -readelf -W --segments $t/exe2 > $t/log2 -! grep -q '\.interp .* \.text' $t/log2 || false +readelf -W --segments $t/exe2 | not grep '\.interp .* \.text' $CC -B. -o $t/exe3 $t/a.o -Wl,--no-rosegment -readelf -W --segments $t/exe3 > $t/log3 -grep -q '\.interp .* \.text' $t/log3 +readelf -W --segments $t/exe3 | grep '\.interp .* \.text' diff --git a/test/rpath.sh b/test/rpath.sh index 83cf3c0ac8..413d5304ed 100755 --- a/test/rpath.sh +++ b/test/rpath.sh @@ -9,7 +9,7 @@ main: EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-rpath,/foo,-rpath,/bar,-R/no/such/directory,-R/ -readelf --dynamic $t/exe1 | grep -Fq 'Library runpath: [/foo:/bar:/no/such/directory:/]' +readelf --dynamic $t/exe1 | grep -F 'Library runpath: [/foo:/bar:/no/such/directory:/]' $CC -B. -o $t/exe2 $t/a.o -Wl,-rpath,/foo,-rpath,/bar,-rpath,/foo,-rpath,/baz -readelf --dynamic $t/exe2 | grep -Fq 'Library runpath: [/foo:/bar:/baz]' +readelf --dynamic $t/exe2 | grep -F 'Library runpath: [/foo:/bar:/baz]' diff --git a/test/run-clang.sh b/test/run-clang.sh index b6ce86d013..19d03bd28c 100755 --- a/test/run-clang.sh +++ b/test/run-clang.sh @@ -4,7 +4,7 @@ [ "$CC" = cc ] || skip # ASAN doesn't work with LD_PRELOAD -nm mold-wrapper.so | grep -q '__[at]san_init' && skip +nm mold-wrapper.so | grep '__[at]san_init' && skip clang --version >& /dev/null || skip @@ -19,5 +19,5 @@ EOF LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \ clang -no-pie -o $t/exe $t/a.o -fuse-ld=/usr/bin/ld -readelf -p .comment $t/exe > $t/log -grep -q mold $t/log + +readelf -p .comment $t/exe | grep mold diff --git a/test/run.sh b/test/run.sh index e62576362c..d9c7a76d93 100755 --- a/test/run.sh +++ b/test/run.sh @@ -4,7 +4,7 @@ [ "$CC" = cc ] || skip # ASAN doesn't work with LD_PRELOAD -nm mold | grep -q '__[at]san_init' && skip +nm mold | grep '__[at]san_init' && skip cat <<'EOF' | $CC -xc -c -o 
$t/a.o - #include @@ -17,24 +17,24 @@ EOF LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \ $CC -o $t/exe $t/a.o -B/usr/bin -readelf -p .comment $t/exe > $t/log -grep -q mold $t/log -./mold -run env | grep -q '^MOLD_PATH=.*/mold$' +readelf -p .comment $t/exe | grep mold -./mold -run /usr/bin/ld --version | grep -q mold -./mold -run /usr/bin/ld.lld --version | grep -q mold -./mold -run /usr/bin/ld.gold --version | grep -q mold +./mold -run env | grep '^MOLD_PATH=.*/mold$' + +./mold -run /usr/bin/ld --version | grep mold +./mold -run /usr/bin/ld.lld --version | grep mold +./mold -run /usr/bin/ld.gold --version | grep mold rm -f $t/ld $t/ld.lld $t/ld.gold $t/foo.ld touch $t/ld $t/ld.lld $t/ld.gold echo "#!/bin/sh" >$t/foo.ld chmod 755 $t/ld $t/ld.lld $t/ld.gold $t/foo.ld -./mold -run $t/ld --version | grep -q mold -./mold -run $t/ld.lld --version | grep -q mold -./mold -run $t/ld.gold --version | grep -q mold -./mold -run $t/foo.ld --version | grep -q mold && false +./mold -run $t/ld --version | grep mold +./mold -run $t/ld.lld --version | grep mold +./mold -run $t/ld.gold --version | grep mold +./mold -run $t/foo.ld --version | not grep mold cat <<'EOF' > $t/sh #!/bin/sh @@ -43,10 +43,10 @@ EOF chmod 755 $t/sh -./mold -run $t/sh ld --version | grep -q mold -./mold -run $t/sh foo.ld --version >& /dev/null | grep -q mold && false +./mold -run $t/sh ld --version | grep mold +./mold -run $t/sh $t/foo.ld --version |& not grep mold -./mold -run $t/sh $t/ld --version | grep -q mold -./mold -run $t/sh $t/ld.lld --version | grep -q mold -./mold -run $t/sh $t/ld.gold --version | grep -q mold -./mold -run $t/sh $t/foo.ld --version | grep -q mold && false +./mold -run $t/sh $t/ld --version | grep mold +./mold -run $t/sh $t/ld.lld --version | grep mold +./mold -run $t/sh $t/ld.gold --version | grep mold +./mold -run $t/sh $t/foo.ld --version | not grep mold diff --git a/test/section-align.sh b/test/section-align.sh index c0c985ed23..97d75e86eb 100755 --- 
a/test/section-align.sh +++ b/test/section-align.sh @@ -7,10 +7,10 @@ int main() {} EOF $CC -B. -o $t/exe1 $t/a.o -Wl,--section-align=.foo=0x2000 -readelf -WS $t/exe1 | grep -q '\.foo.* 8192$' +readelf -WS $t/exe1 | grep '\.foo.* 8192$' $CC -B. -o $t/exe2 $t/a.o -Wl,--section-align=.foo=256 -readelf -WS $t/exe2 | grep -q '\.foo.* 256$' +readelf -WS $t/exe2 | grep '\.foo.* 256$' -! $CC -B. -o $t/exe3 $t/a.o -Wl,--section-align=.foo=3 2>&1 | \ - grep -q 'must be a power of 2' +not $CC -B. -o $t/exe3 $t/a.o -Wl,--section-align=.foo=3 |& + grep 'must be a power of 2' diff --git a/test/section-attributes.sh b/test/section-attributes.sh index 63eacf3582..5f38f3799b 100755 --- a/test/section-attributes.sh +++ b/test/section-attributes.sh @@ -21,4 +21,4 @@ int main() {} EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o $t/d.o -readelf -W --sections $t/exe | grep -q 'foobar.*WAX' +readelf -W --sections $t/exe | grep 'foobar.*WAX' diff --git a/test/section-order.sh b/test/section-order.sh index 989089cd4d..6b7d728e46 100755 --- a/test/section-order.sh +++ b/test/section-order.sh @@ -1,10 +1,6 @@ #!/bin/bash . $(dirname $0)/common.inc -# qemu crashes if the ELF header is not mapped to memory -on_qemu && skip -[ "$(uname)" = FreeBSD ] && skip - cat < @@ -17,28 +13,27 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -no-pie \ - -Wl,--section-order='=0x100000 PHDR =0x200000 .fn2 TEXT =0x300000 .fn1 DATA BSS RODATA' -$QEMU $t/exe1 | grep -q Hello + -Wl,--section-order='=0x100000 EHDR PHDR =0x200000 .fn2 TEXT =0x300000 .fn1 DATA BSS RODATA' +$QEMU $t/exe1 | grep Hello -readelf -SW $t/exe1 | grep -q '\.fn2 .*00200000' -readelf -SW $t/exe1 | grep -q '\.fn1 .*00300000' -readelf -sw $t/exe1 | grep -Eq ': 0+\s.*\s__ehdr_start$' +readelf -SW $t/exe1 | grep '\.fn2 .*00200000' +readelf -SW $t/exe1 | grep '\.fn1 .*00300000' $CC -B. 
-o $t/exe2 $t/a.o -no-pie \ -Wl,--section-order='=0x200000 EHDR RODATA =0x300000 PHDR =0x400000 .fn2 TEXT DATA BSS' -readelf -SW $t/exe2 | grep -q '\.fn2 .*00400000' -readelf -sW $t/exe2 | grep -Eq ': 0+200000\s.*\s__ehdr_start$' -readelf -W --segments $t/exe2 | grep -Eq 'PHDR\s.*0x0+300000\s' +readelf -SW $t/exe2 | grep '\.fn2 .*00400000' +readelf -sW $t/exe2 | grep -E ': 0+200000\s.*\s__ehdr_start$' +readelf -W --segments $t/exe2 | grep -E 'PHDR\s.*0x0+300000\s' $CC -B. -o $t/exe3 $t/a.o -no-pie \ -Wl,--section-order='=0x200000 !ehdr_start EHDR %0x20 !rodata_start RODATA =0x300000 !phdr_start PHDR %4096 !phdr_end =0x400000 !text_start TEXT DATA BSS' readelf -sW $t/exe3 > $t/log3 -grep -Eq '\b0+200000 .* ehdr_start$' $t/log3 -grep -Eq '\b0+200040 .* rodata_start$' $t/log3 -grep -Eq '\b0+300000 .* phdr_start$' $t/log3 -grep -Eq '\b0+301000 .* phdr_end$' $t/log3 -grep -Eq '\b0+400000 .* text_start$' $t/log3 +grep -E '\b0+200000 .* ehdr_start$' $t/log3 +grep -E '\b0+200040 .* rodata_start$' $t/log3 +grep -E '\b0+300000 .* phdr_start$' $t/log3 +grep -E '\b0+301000 .* phdr_end$' $t/log3 +grep -E '\b0+400000 .* text_start$' $t/log3 diff --git a/test/section-start.sh b/test/section-start.sh index e218a6c4dc..269f7d8d5a 100755 --- a/test/section-start.sh +++ b/test/section-start.sh @@ -26,7 +26,7 @@ EOF $CC -B. -o $t/exe1 $t/a.o -no-pie \ -Wl,--section-start=.fn1=0x10000000,--section-start=.fn2=0x20000000 -$QEMU $t/exe1 | grep -q 'main fn1 fn2 0x10000000 0x20000000' +$QEMU $t/exe1 | grep 'main fn1 fn2 0x10000000 0x20000000' # PT_LOAD must be sorted on p_vaddr readelf -W --segments $t/exe1 | grep ' LOAD ' | sed 's/0x[0-9a-f]*//' > $t/log1 @@ -34,7 +34,7 @@ diff $t/log1 <(sort $t/log1) $CC -B. 
-o $t/exe2 $t/a.o -no-pie \ -Wl,--section-start=.fn1=0x20000000,--section-start=.fn2=0x10000000 -$QEMU $t/exe2 | grep -q 'main fn1 fn2 0x20000000 0x10000000' +$QEMU $t/exe2 | grep 'main fn1 fn2 0x20000000 0x10000000' readelf -W --segments $t/exe2 | grep ' LOAD ' | sed 's/0x[0-9a-f]*//' > $t/log2 diff $t/log2 <(sort $t/log2) diff --git a/test/separate-debug-file.sh b/test/separate-debug-file.sh index 7430c94e9b..c7dcb93b85 100755 --- a/test/separate-debug-file.sh +++ b/test/separate-debug-file.sh @@ -1,7 +1,7 @@ #!/bin/bash . $(dirname $0)/common.inc -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip on_qemu && skip command -v gdb >& /dev/null || skip command -v flock >& /dev/null || skip @@ -15,14 +15,24 @@ EOF $CC -c -o $t/a.o $t/a.c -g $CC -B. -o $t/exe1 $t/a.o -Wl,--separate-debug-file -readelf -SW $t/exe1 | grep -Fq .gnu_debuglink +readelf -SW $t/exe1 | grep -F .gnu_debuglink flock $t/exe1 true -gdb $t/exe1 -ex 'list main' -ex 'quit' | grep -Fq printf +gdb $t/exe1 -ex 'list main' -ex 'quit' | grep -F printf + $CC -c -o $t/a.o $t/a.c -g -$CC -B. -o $t/exe2 $t/a.o -Wl,--separate-debug-file -Wl,--no-build-id -readelf -SW $t/exe2 | grep -Fq .gnu_debuglink +$CC -B. -o $t/exe2 $t/a.o -Wl,--separate-debug-file,--no-build-id +readelf -SW $t/exe2 | grep -F .gnu_debuglink flock $t/exe2 true -gdb $t/exe2 -ex 'list main' -ex 'quit' | grep -Fq printf +gdb $t/exe2 -ex 'list main' -ex 'quit' | grep -F printf + + +$CC -c -o $t/a.o $t/a.c -g +$CC -B. -o $t/exe3 $t/a.o -Wl,--separate-debug-file,--compress-debug-sections=zlib +readelf -SW $t/exe3 | grep -F .gnu_debuglink + +flock $t/exe3 true +readelf -W --sections $t/exe3.dbg | grep '\.debug_info .*C' +gdb $t/exe3 -ex 'list main' -ex 'quit' | grep -F printf diff --git a/test/shared-abs-sym.sh b/test/shared-abs-sym.sh index c68b0c6fdd..b46866d219 100755 --- a/test/shared-abs-sym.sh +++ b/test/shared-abs-sym.sh @@ -22,12 +22,12 @@ EOF cp $t/a.so $t/c.so $CC -B. 
-o $t/exe1 $t/d.o $t/c.so -pie || skip -$QEMU $t/exe1 | grep -q 'foo=0x3' || skip +$QEMU $t/exe1 | grep 'foo=0x3' || skip cp $t/b.so $t/c.so -$QEMU $t/exe1 | grep -q 'foo=0x5' +$QEMU $t/exe1 | grep 'foo=0x5' cp $t/a.so $t/c.so $CC -B. -o $t/exe2 $t/d.o $t/c.so -no-pie -$QEMU $t/exe2 | grep -q 'foo=0x3' +$QEMU $t/exe2 | grep 'foo=0x3' cp $t/b.so $t/c.so -$QEMU $t/exe1 | grep -q 'foo=0x5' +$QEMU $t/exe1 | grep 'foo=0x5' diff --git a/test/shared.sh b/test/shared.sh index 63803a0f30..108674825e 100755 --- a/test/shared.sh +++ b/test/shared.sh @@ -11,8 +11,8 @@ $CC -B. -shared -o $t/b.so $t/a.o readelf --dyn-syms $t/b.so > $t/log -grep -q '00000000 0 NOTYPE GLOBAL DEFAULT UND fn2' $t/log -grep -Eq 'FUNC GLOBAL DEFAULT .* fn1' $t/log +grep '00000000 0 NOTYPE GLOBAL DEFAULT UND fn2' $t/log +grep -E 'FUNC GLOBAL DEFAULT .* fn1' $t/log cat < @@ -30,5 +30,5 @@ int main() { EOF $CC -B. -o $t/exe $t/c.o $t/b.so -$QEMU $t/exe | grep -q hello -! readelf --symbols $t/exe | grep -q fn3 || false +$QEMU $t/exe | grep hello +readelf --symbols $t/exe | not grep fn3 diff --git a/test/shuffle-sections-seed.sh b/test/shuffle-sections-seed.sh index 21bb7d174b..974e52f195 100755 --- a/test/shuffle-sections-seed.sh +++ b/test/shuffle-sections-seed.sh @@ -10,21 +10,21 @@ EOF # Create a lot of sections to lower the probability that # we get the identical output as a result of shuffling. -for i in `seq 1 1000`; do echo "void fn$i() {}"; done | \ +for i in `seq 1 1000`; do echo "void fn$i() {}"; done | $CC -o $t/b.o -ffunction-sections -c -xc - $CC -B. -o $t/exe1 $t/a.o $t/b.o -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-shuffle-sections=42 -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' $CC -B. -o $t/exe3 $t/a.o $t/b.o -Wl,-shuffle-sections=42 -$QEMU $t/exe3 | grep -q 'Hello world' +$QEMU $t/exe3 | grep 'Hello world' $CC -B. 
-o $t/exe4 $t/a.o $t/b.o -Wl,-shuffle-sections=5 -$QEMU $t/exe4 | grep -q 'Hello world' +$QEMU $t/exe4 | grep 'Hello world' -! diff $t/exe1 $t/exe2 >& /dev/null || false +not diff $t/exe1 $t/exe2 >& /dev/null diff $t/exe2 $t/exe3 -! diff $t/exe3 $t/exe4 >& /dev/null || false +not diff $t/exe3 $t/exe4 >& /dev/null diff --git a/test/shuffle-sections.sh b/test/shuffle-sections.sh index 24cb30f6e7..1fbaa6c1d9 100755 --- a/test/shuffle-sections.sh +++ b/test/shuffle-sections.sh @@ -10,13 +10,13 @@ EOF # Create a lot of sections to lower the probability that # we get the identical output as a result of shuffling. -for i in `seq 1 1000`; do echo "void fn$i() {}"; done | \ +for i in `seq 1 1000`; do echo "void fn$i() {}"; done | $CC -o $t/b.o -ffunction-sections -c -xc - $CC -B. -o $t/exe1 $t/a.o $t/b.o -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-shuffle-sections -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' -! diff $t/exe1 $t/exe2 >& /dev/null || false +not diff $t/exe1 $t/exe2 >& /dev/null diff --git a/test/soname.sh b/test/soname.sh index 6f347873b6..1f50dcab67 100755 --- a/test/soname.sh +++ b/test/soname.sh @@ -6,8 +6,7 @@ void foo() {} EOF $CC -o $t/b.so -shared $t/a.o -readelf --dynamic $t/b.so > $t/log -! grep -Fq 'Library soname' $t/log || false +readelf --dynamic $t/b.so | not grep -F 'Library soname' $CC -B. -o $t/b.so -shared $t/a.o -Wl,-soname,foo -readelf --dynamic $t/b.so | grep -Fq 'Library soname: [foo]' +readelf --dynamic $t/b.so | grep -F 'Library soname: [foo]' diff --git a/test/spare-program-headers.sh b/test/spare-program-headers.sh index 782925a190..f20a8ea44f 100755 --- a/test/spare-program-headers.sh +++ b/test/spare-program-headers.sh @@ -9,17 +9,17 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' [ "$(readelf -Wl $t/exe1 | grep NULL | wc -l)" -eq 0 ] $CC -B. 
-o $t/exe2 $t/a.o -Wl,--spare-program-headers=0 -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' [ "$(readelf -Wl $t/exe2 | grep NULL | wc -l)" -eq 0 ] $CC -B. -o $t/exe3 $t/a.o -Wl,--spare-program-headers=1 -$QEMU $t/exe3 | grep -q 'Hello world' +$QEMU $t/exe3 | grep 'Hello world' [ "$(readelf -Wl $t/exe3 | grep NULL | wc -l)" -eq 1 ] $CC -B. -o $t/exe4 $t/a.o -Wl,--spare-program-headers=5 -$QEMU $t/exe4 | grep -q 'Hello world' +$QEMU $t/exe4 | grep 'Hello world' [ "$(readelf -Wl $t/exe4 | grep NULL | wc -l)" -eq 5 ] diff --git a/test/start-lib.sh b/test/start-lib.sh index 607da80e88..2e4cce9b9a 100755 --- a/test/start-lib.sh +++ b/test/start-lib.sh @@ -15,5 +15,5 @@ EOF $CC -B. -o $t/exe -Wl,-start-lib $t/a.o -Wl,-end-lib $t/b.o $t/c.o nm $t/exe > $t/log -! grep -q ' foo$' $t/log || false -grep -q ' bar$' $t/log +not grep ' foo$' $t/log +grep ' bar$' $t/log diff --git a/test/start-stop-symbol.sh b/test/start-stop-symbol.sh index 69788a088c..c5b22003f1 100755 --- a/test/start-stop-symbol.sh +++ b/test/start-stop-symbol.sh @@ -21,7 +21,7 @@ int main() { EOF $CC -B. -o $t/exe $t/c.o $t/b.a -$QEMU $t/exe | grep -q 'section foo section foo' +$QEMU $t/exe | grep 'section foo section foo' $CC -B. -o $t/exe $t/c.o $t/b.a -Wl,-gc-sections -$QEMU $t/exe | grep -q 'section foo section foo' +$QEMU $t/exe | grep 'section foo section foo' diff --git a/test/start-stop.sh b/test/start-stop.sh index bc0f587827..97edc736c2 100755 --- a/test/start-stop.sh +++ b/test/start-stop.sh @@ -6,5 +6,5 @@ int main() {} EOF $CC -B. -o $t/exe1 $t/a.o -Wl,--start-stop -readelf -sW $t/exe1 | grep -q __start_text -readelf -sW $t/exe1 | grep -q __stop_text +readelf -sW $t/exe1 | grep __start_text +readelf -sW $t/exe1 | grep __stop_text diff --git a/test/static-archive.sh b/test/static-archive.sh index b4de1ebe57..f3cd348d34 100755 --- a/test/static-archive.sh +++ b/test/static-archive.sh @@ -25,8 +25,8 @@ rm -f $t/d.a $CC -B. 
-Wl,--trace -o $t/exe $t/c.o $t/d.a > $t/log -grep -Fq 'static-archive/d.a(long-long-long-filename.o)' $t/log -grep -Fq 'static-archive/d.a(b.o)' $t/log -grep -Fq static-archive/c.o $t/log +grep -F 'static-archive/d.a(long-long-long-filename.o)' $t/log +grep -F 'static-archive/d.a(b.o)' $t/log +grep -F static-archive/c.o $t/log -$QEMU $t/exe | grep -q '8' +$QEMU $t/exe | grep '8' diff --git a/test/static-pie.sh b/test/static-pie.sh index 81e3884354..0160b5cbe3 100755 --- a/test/static-pie.sh +++ b/test/static-pie.sh @@ -11,7 +11,7 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -static-pie -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -static-pie -Wl,--no-relax -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' diff --git a/test/stdout.sh b/test/stdout.sh index 8752da0688..975e6e9dde 100755 --- a/test/stdout.sh +++ b/test/stdout.sh @@ -12,4 +12,4 @@ EOF $CC -B. -Wl,-build-id=sha1 $t/a.o -o - > $t/exe chmod 755 $t/exe -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' diff --git a/test/strip-debug.sh b/test/strip-debug.sh index a975c72219..6bc44450df 100755 --- a/test/strip-debug.sh +++ b/test/strip-debug.sh @@ -9,7 +9,5 @@ EOF $CC -B. -o $t/exe $t/a.o -Wl,--strip-debug -readelf -W --sections $t/exe > $t/log -! 
grep -Fq .debug_info $t/log || false - -readelf -W --symbols $t/exe | grep -q ' bar' +readelf -W --sections $t/exe | not grep -F .debug_info +readelf -W --symbols $t/exe | grep ' bar' diff --git a/test/strip.sh b/test/strip.sh index de6b7d1e81..91bed39239 100755 --- a/test/strip.sh +++ b/test/strip.sh @@ -11,20 +11,20 @@ EOF ./mold -o $t/exe $t/a.o readelf --symbols $t/exe > $t/log -grep -Fq _start $t/log -grep -Fq foo $t/log -grep -Fq bar $t/log +grep -F _start $t/log +grep -F foo $t/log +grep -F bar $t/log if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then - grep -Fq .L.baz $t/log + grep -F .L.baz $t/log fi ./mold -o $t/exe $t/a.o -strip-all readelf --symbols $t/exe > $t/log -! grep -Fq _start $t/log || false -! grep -Fq foo $t/log || false -! grep -Fq bar $t/log || false +not grep -F _start $t/log +not grep -F foo $t/log +not grep -F bar $t/log if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then - ! grep -Fq .L.baz $t/log || false + not grep -F .L.baz $t/log fi diff --git a/test/stt-common.sh b/test/stt-common.sh index a7b4ae0c32..2e6eaa205e 100755 --- a/test/stt-common.sh +++ b/test/stt-common.sh @@ -20,7 +20,7 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,--fatal-warnings -$QEMU $t/exe | grep -q '0 5 42' +$QEMU $t/exe | grep '0 5 42' readelf --sections $t/exe > $t/log -grep -q '.common .*NOBITS' $t/log +grep '.common .*NOBITS' $t/log diff --git a/test/symbol-rank.sh b/test/symbol-rank.sh index 45f316ec29..2ce0017815 100755 --- a/test/symbol-rank.sh +++ b/test/symbol-rank.sh @@ -39,4 +39,4 @@ rm -f $t/x.a ar rcs $t/x.a $t/a.o $t/b.o $t/c.o $CC -B. -o $t/exe $t/d.o $t/x.a -$QEMU $t/exe | grep -q '^0 0 0 5$' +$QEMU $t/exe | grep '^0 0 0 5$' diff --git a/test/symbol-version-lto.sh b/test/symbol-version-lto.sh index de02e456df..b5c7af22cd 100755 --- a/test/symbol-version-lto.sh +++ b/test/symbol-version-lto.sh @@ -12,4 +12,4 @@ echo 'VER1 { foo; };' > $t/b.ver $CC -B. 
-shared -o $t/c.so $t/a.o -Wl,--version-script=$t/b.ver -flto readelf --symbols $t/c.so > $t/log -grep -Fq 'foo@@VER1' $t/log +grep -F 'foo@@VER1' $t/log diff --git a/test/symbol-version.sh b/test/symbol-version.sh index 081ba2a5bd..541a8b3248 100755 --- a/test/symbol-version.sh +++ b/test/symbol-version.sh @@ -21,6 +21,6 @@ echo 'VER1 { local: *; }; VER2 { local: *; }; VER3 { local: *; };' > $t/b.ver $CC -B. -shared -o $t/c.so $t/a.o -Wl,--version-script=$t/b.ver readelf --symbols $t/c.so > $t/log -grep -Fq 'foo@VER1' $t/log -grep -Fq 'foo@VER2' $t/log -grep -Fq 'foo@@VER3' $t/log +grep -F 'foo@VER1' $t/log +grep -F 'foo@VER2' $t/log +grep -F 'foo@@VER3' $t/log diff --git a/test/symbol-version2.sh b/test/symbol-version2.sh index 982154e85f..fd88cf8316 100755 --- a/test/symbol-version2.sh +++ b/test/symbol-version2.sh @@ -16,7 +16,7 @@ EOF $CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version readelf -W --dyn-syms $t/c.so > $t/log -grep -q ' foo@TEST' $t/log -grep -q ' bar@TEST' $t/log -grep -q ' bar1' $t/log -! grep -q ' foo@@TEST' $t/log || false +grep ' foo@TEST' $t/log +grep ' bar@TEST' $t/log +grep ' bar1' $t/log +not grep ' foo@@TEST' $t/log diff --git a/test/symbol-version3.sh b/test/symbol-version3.sh index 8eeae4cc7b..21b4584320 100755 --- a/test/symbol-version3.sh +++ b/test/symbol-version3.sh @@ -19,7 +19,7 @@ EOF $CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version readelf -W --dyn-syms $t/c.so > $t/log -grep -q ' foo@@TEST1' $t/log -grep -q ' foo@TEST2' $t/log -grep -q ' foo@TEST3' $t/log -! grep -q ' foo$' $t/log || false +grep ' foo@@TEST1' $t/log +grep ' foo@TEST2' $t/log +grep ' foo@TEST3' $t/log +not grep ' foo$' $t/log diff --git a/test/symbol-version4.sh b/test/symbol-version4.sh index f6f7c0dadc..3c4a5d0874 100755 --- a/test/symbol-version4.sh +++ b/test/symbol-version4.sh @@ -55,4 +55,4 @@ int main() { EOF $CC -B. 
-o $t/exe $t/g.o $t/f.so $t/c.so -$QEMU $t/exe | grep -q 'foo bar' +$QEMU $t/exe | grep 'foo bar' diff --git a/test/symtab-dso.sh b/test/symtab-dso.sh index 2ccae78d52..3ef0229d3f 100755 --- a/test/symtab-dso.sh +++ b/test/symtab-dso.sh @@ -9,4 +9,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -nm $t/exe | grep -q 'U puts$' +nm $t/exe | grep 'U puts$' diff --git a/test/symtab-section-symbols.sh b/test/symtab-section-symbols.sh index a53ac3a5ce..1d35b6d504 100755 --- a/test/symtab-section-symbols.sh +++ b/test/symtab-section-symbols.sh @@ -9,4 +9,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -readelf -s $t/exe | grep -q 'SECTION LOCAL DEFAULT' +readelf -s $t/exe | grep 'SECTION LOCAL DEFAULT' diff --git a/test/symtab.sh b/test/symtab.sh index d80204e816..2605385d4a 100755 --- a/test/symtab.sh +++ b/test/symtab.sh @@ -24,9 +24,9 @@ echo '{ local: module_local; global: *; };' > $t/c.map readelf --symbols $t/exe > $t/log -grep -Eq '0 NOTYPE LOCAL DEFAULT .* local1' $t/log -grep -Eq '0 NOTYPE LOCAL DEFAULT .* local2' $t/log -grep -Eq '0 NOTYPE LOCAL DEFAULT .* module_local' $t/log -grep -Eq '0 NOTYPE GLOBAL DEFAULT .* foo' $t/log -grep -Eq '0 NOTYPE GLOBAL DEFAULT .* bar' $t/log -grep -Eq '0 NOTYPE GLOBAL DEFAULT .* this_is_global' $t/log +grep -E '0 NOTYPE LOCAL DEFAULT .* local1' $t/log +grep -E '0 NOTYPE LOCAL DEFAULT .* local2' $t/log +grep -E '0 NOTYPE LOCAL DEFAULT .* module_local' $t/log +grep -E '0 NOTYPE GLOBAL DEFAULT .* foo' $t/log +grep -E '0 NOTYPE GLOBAL DEFAULT .* bar' $t/log +grep -E '0 NOTYPE GLOBAL DEFAULT .* this_is_global' $t/log diff --git a/test/synthetic-symbols.sh b/test/synthetic-symbols.sh index 9249435bfa..1c1c162515 100755 --- a/test/synthetic-symbols.sh +++ b/test/synthetic-symbols.sh @@ -42,10 +42,10 @@ EOF $CC -B. 
-no-pie -Wl,--image-base=0x40000 -o $t/exe $t/a.o $t/b.o $QEMU $t/exe > $t/log -grep -q '^__ehdr_start=0x40000$' $t/log -grep -q '^__executable_start=0x40000$' $t/log -grep -q '^__dso_handle=' $t/log -grep -q '^section foo$' $t/log +grep '^__ehdr_start=0x40000$' $t/log +grep '^__executable_start=0x40000$' $t/log +grep '^__dso_handle=' $t/log +grep '^section foo$' $t/log # Make sure that synthetic symbols overwrite existing ones @@ -86,9 +86,9 @@ EOF $CC -B. -no-pie -Wl,--image-base=0x40000 -o $t/exe $t/a.o $t/c.o $QEMU $t/exe > $t/log -grep -q '^end=foo$' $t/log -grep -q '^etext=foo$' $t/log -grep -q '^edata=foo$' $t/log -grep -q '^__ehdr_start=0x40000$' $t/log -grep -q '^__executable_start=0x40000$' $t/log -grep -q '^section foo$' $t/log +grep '^end=foo$' $t/log +grep '^etext=foo$' $t/log +grep '^edata=foo$' $t/log +grep '^__ehdr_start=0x40000$' $t/log +grep '^__executable_start=0x40000$' $t/log +grep '^section foo$' $t/log diff --git a/test/sysroot.sh b/test/sysroot.sh index dfa6a10c17..f5dd260728 100755 --- a/test/sysroot.sh +++ b/test/sysroot.sh @@ -22,19 +22,14 @@ int main() { } EOF -$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ \ - -Wl,-L=foo/bar -lfoo +$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ -Wl,-L=foo/bar -lfoo -$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ \ - -Wl,-L=/foo/bar -lfoo +$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ -Wl,-L=/foo/bar -lfoo -$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ \ - '-Wl,-L$SYSROOTfoo/bar' -lfoo +$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ '-Wl,-L$SYSROOTfoo/bar' -lfoo -$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ \ - '-Wl,-L$SYSROOT/foo/bar' -lfoo +$CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t/ '-Wl,-L$SYSROOT/foo/bar' -lfoo -! $CC -B. -o $t/exe $t/c.o -lfoo >& /dev/null +not $CC -B. -o $t/exe $t/c.o -lfoo >& /dev/null -! $CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t \ - -Wl,-Lfoo/bar -lfoo >& /dev/null +not $CC -B. 
-o $t/exe $t/c.o -Wl,--sysroot=$t -Wl,-Lfoo/bar -lfoo >& /dev/null diff --git a/test/tail-call.sh b/test/tail-call.sh index 0e1de04c43..dec8c883a8 100755 --- a/test/tail-call.sh +++ b/test/tail-call.sh @@ -21,8 +21,8 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o -$QEMU $t/exe | grep -q '42' +$QEMU $t/exe | grep '42' if [ $MACHINE = riscv32 -o $MACHINE = riscv64 ]; then - $OBJDUMP -d $t/exe | grep -q bfed # c.j pc - 6 + $OBJDUMP -d $t/exe | grep bfed # c.j pc - 6 fi diff --git a/test/textrel.sh b/test/textrel.sh new file mode 100755 index 0000000000..f757ab3665 --- /dev/null +++ b/test/textrel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +__attribute__((section(".text"))) +int (*fn)(const char *s) = puts; + +void hello() { + puts("Hello world"); +} +EOF + +$CC -B. -o $t/exe $t/a.o $t/b.o -no-pie +$QEMU $t/exe | grep 'Hello world' diff --git a/test/textrel2.sh b/test/textrel2.sh new file mode 100755 index 0000000000..f242368758 --- /dev/null +++ b/test/textrel2.sh @@ -0,0 +1,30 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +void hello() { + puts("Hello world"); +} + +__attribute__((section(".text"))) +void (*p)() = hello; + +int main() { + p(); +} +EOF + +$CC -o $t/exe1 $t/a.o -pie +$QEMU $t/exe1 | grep 'Hello world' || skip + +$CC -B. -o $t/exe2 $t/a.o -pie +$QEMU $t/exe2 | grep 'Hello world' + +$CC -o $t/exe3 $t/a.o -pie -Wl,-z,pack-relative-relocs 2> /dev/null || skip +readelf -WS $t/exe3 | grep -F .relr.dyn || skip +$QEMU $t/exe3 2> /dev/null | grep 'Hello world' || skip + +$CC -B. -o $t/exe4 $t/a.o -pie -Wl,-z,pack-relative-relocs +$QEMU $t/exe4 | grep 'Hello world' diff --git a/test/thin-archive.sh b/test/thin-archive.sh index a1e7d60cb0..bd1d7bdd15 100755 --- a/test/thin-archive.sh +++ b/test/thin-archive.sh @@ -30,8 +30,8 @@ rm -f $t/d.a $CC -B. 
-Wl,--trace -o $t/exe $t/d.o $t/d.a > $t/log -grep -Eq 'thin-archive/d.a\(.*long-long-long-filename.o\)' $t/log -grep -Eq 'thin-archive/d.a\((.*/)?b.o\)' $t/log -grep -Fq thin-archive/d.o $t/log +grep -E 'thin-archive/d.a\(.*long-long-long-filename.o\)' $t/log +grep -E 'thin-archive/d.a\((.*/)?b.o\)' $t/log +grep -F thin-archive/d.o $t/log -$QEMU $t/exe | grep -q 15 +$QEMU $t/exe | grep 15 diff --git a/test/tls-common.sh b/test/tls-common.sh index 8f475914c1..3019a7a9a5 100755 --- a/test/tls-common.sh +++ b/test/tls-common.sh @@ -17,5 +17,5 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o $t/b.o -readelf -WS $t/exe | grep -Fq .tls_common -$QEMU $t/exe | grep -q '^foo=0$' +readelf -WS $t/exe | grep -F .tls_common +$QEMU $t/exe | grep '^foo=0$' diff --git a/test/tls-df-static-tls.sh b/test/tls-df-static-tls.sh index efba6aaddb..9c32dc8d43 100755 --- a/test/tls-df-static-tls.sh +++ b/test/tls-df-static-tls.sh @@ -8,7 +8,7 @@ int bar() { return foo; } EOF $CC -B. -shared -o $t/b.so $t/a.o -Wl,--relax -readelf --dynamic $t/b.so | grep -q STATIC_TLS +readelf --dynamic $t/b.so | grep STATIC_TLS $CC -B. -shared -o $t/c.so $t/a.o -Wl,--no-relax -readelf --dynamic $t/c.so | grep -q STATIC_TLS +readelf --dynamic $t/c.so | grep STATIC_TLS diff --git a/test/tls-dso.sh b/test/tls-dso.sh index a045ad5f2d..ee788e59e6 100755 --- a/test/tls-dso.sh +++ b/test/tls-dso.sh @@ -33,4 +33,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.so $t/b.o -$QEMU $t/exe | grep -q '5 3 5 3 5 3' +$QEMU $t/exe | grep '5 3 5 3 5 3' diff --git a/test/tls-gd-dlopen.sh b/test/tls-gd-dlopen.sh index e269b1f73f..c1c739219d 100755 --- a/test/tls-gd-dlopen.sh +++ b/test/tls-gd-dlopen.sh @@ -30,4 +30,4 @@ int main(int argc, char **argv) { EOF $CC -B. -o $t/exe $t/c.o -ldl -$QEMU $t/exe $t/b.so | grep -q '3 0 5' +$QEMU $t/exe $t/b.so | grep '3 0 5' diff --git a/test/tls-gd-noplt.sh b/test/tls-gd-noplt.sh index 5dfb74bc16..32f8f1f7c0 100755 --- a/test/tls-gd-noplt.sh +++ b/test/tls-gd-noplt.sh @@ -35,7 +35,7 @@ $CC -B. 
-shared -o $t/d.so $t/b.o $CC -B. -shared -o $t/e.so $t/c.o -Wl,--no-relax $CC -B. -o $t/exe $t/a.o $t/d.so $t/e.so -$QEMU $t/exe | grep -q '1 2 3 4 5 6' +$QEMU $t/exe | grep '1 2 3 4 5 6' $CC -B. -o $t/exe $t/a.o $t/d.so $t/e.so -Wl,-no-relax -$QEMU $t/exe | grep -q '1 2 3 4 5 6' +$QEMU $t/exe | grep '1 2 3 4 5 6' diff --git a/test/tls-gd-to-ie.sh b/test/tls-gd-to-ie.sh index 2851547ff4..c92d8d1d09 100755 --- a/test/tls-gd-to-ie.sh +++ b/test/tls-gd-to-ie.sh @@ -23,16 +23,16 @@ EOF $CC -B. -shared -o $t/c.so $t/a.o $CC -B. -o $t/exe1 $t/b.o $t/c.so -$QEMU $t/exe1 | grep -q '1 2 3' +$QEMU $t/exe1 | grep '1 2 3' $CC -B. -shared -o $t/d.so $t/a.o -Wl,-no-relax $CC -B. -o $t/exe2 $t/b.o $t/d.so -$QEMU $t/exe2 | grep -q '1 2 3' +$QEMU $t/exe2 | grep '1 2 3' $CC -B. -shared -o $t/e.so $t/a.o -Wl,-z,nodlopen $CC -B. -o $t/exe3 $t/b.o $t/e.so -$QEMU $t/exe3 | grep -q '1 2 3' +$QEMU $t/exe3 | grep '1 2 3' $CC -B. -shared -o $t/f.so $t/a.o -Wl,-z,nodlopen -Wl,-no-relax $CC -B. -o $t/exe4 $t/b.o $t/f.so -$QEMU $t/exe4 | grep -q '1 2 3' +$QEMU $t/exe4 | grep '1 2 3' diff --git a/test/tls-gd.sh b/test/tls-gd.sh index 7d0eb7762a..737f9b34ed 100755 --- a/test/tls-gd.sh +++ b/test/tls-gd.sh @@ -36,15 +36,15 @@ $CC -B. -shared -o $t/d.so $t/b.o $CC -B. -shared -o $t/e.so $t/c.o -Wl,--no-relax $CC -B. -o $t/exe1 $t/a.o $t/d.so $t/e.so -$QEMU $t/exe1 | grep -q '1 2 3 4 5 6' +$QEMU $t/exe1 | grep '1 2 3 4 5 6' $CC -B. -o $t/exe2 $t/a.o $t/d.so $t/e.so -Wl,-no-relax -$QEMU $t/exe2 | grep -q '1 2 3 4 5 6' +$QEMU $t/exe2 | grep '1 2 3 4 5 6' if test_cflags -static; then $CC -B. -o $t/exe3 $t/a.o $t/b.o $t/c.o -static - $QEMU $t/exe3 | grep -q '1 2 3 4 5 6' + $QEMU $t/exe3 | grep '1 2 3 4 5 6' $CC -B. 
-o $t/exe4 $t/a.o $t/b.o $t/c.o -static -Wl,-no-relax - $QEMU $t/exe4 | grep -q '1 2 3 4 5 6' + $QEMU $t/exe4 | grep '1 2 3 4 5 6' fi diff --git a/test/tls-ie.sh b/test/tls-ie.sh index b70463203f..422bc00b32 100755 --- a/test/tls-ie.sh +++ b/test/tls-ie.sh @@ -37,7 +37,7 @@ int main() { EOF $CC -B. -o $t/exe $t/b.so $t/c.o -$QEMU $t/exe | grep -q '^0 0 3 5 7$' +$QEMU $t/exe | grep '^0 0 3 5 7$' $CC -B. -o $t/exe $t/b.so $t/c.o -Wl,-no-relax -$QEMU $t/exe | grep -q '^0 0 3 5 7$' +$QEMU $t/exe | grep '^0 0 3 5 7$' diff --git a/test/tls-irregular-start-addr.sh b/test/tls-irregular-start-addr.sh index dde3d4fe03..0e1f2deb0b 100755 --- a/test/tls-irregular-start-addr.sh +++ b/test/tls-irregular-start-addr.sh @@ -12,13 +12,13 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -pie -Wl,-section-start=.tdata=0x100001 -Wl,-relax -$QEMU $t/exe1 | grep -q '^42$' +$QEMU $t/exe1 | grep '^42$' $CC -B. -o $t/exe2 $t/a.o -pie -Wl,-section-start=.tdata=0x100001 -Wl,-no-relax -$QEMU $t/exe2 | grep -q '^42$' +$QEMU $t/exe2 | grep '^42$' $CC -B. -o $t/exe3 $t/a.o -pie -Wl,-section-start=.tdata=0x10000f -Wl,-relax -$QEMU $t/exe3 | grep -q '^42$' +$QEMU $t/exe3 | grep '^42$' $CC -B. -o $t/exe4 $t/a.o -pie -Wl,-section-start=.tdata=0x10000f -Wl,-no-relax -$QEMU $t/exe4 | grep -q '^42$' +$QEMU $t/exe4 | grep '^42$' diff --git a/test/tls-large-alignment.sh b/test/tls-large-alignment.sh index 284cc791e7..a88ac32da1 100755 --- a/test/tls-large-alignment.sh +++ b/test/tls-large-alignment.sh @@ -25,7 +25,7 @@ EOF $CC -B. -shared -o $t/d.so $t/a.o $t/b.o $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -$QEMU $t/exe1 | grep -q '^42 1 2 3$' +$QEMU $t/exe1 | grep '^42 1 2 3$' $CC -B. -o $t/exe2 $t/c.o $t/d.so -$QEMU $t/exe2 | grep -q '^42 1 2 3$' +$QEMU $t/exe2 | grep '^42 1 2 3$' diff --git a/test/tls-large-static-image.sh b/test/tls-large-static-image.sh index f253d0af45..154d7152f1 100755 --- a/test/tls-large-static-image.sh +++ b/test/tls-large-static-image.sh @@ -15,4 +15,4 @@ int main() { EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q '^1 2 3 0 5$' +$QEMU $t/exe | grep '^1 2 3 0 5$' diff --git a/test/tls-ld-noplt.sh b/test/tls-ld-noplt.sh index 07c8c3fd97..05605ee9be 100755 --- a/test/tls-ld-noplt.sh +++ b/test/tls-ld-noplt.sh @@ -23,7 +23,7 @@ _Thread_local int foo = 3; EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q '3 5 3 5' +$QEMU $t/exe | grep '3 5 3 5' $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-no-relax -$QEMU $t/exe | grep -q '3 5 3 5' +$QEMU $t/exe | grep '3 5 3 5' diff --git a/test/tls-ld.sh b/test/tls-ld.sh index 88e452028d..f6e2245de4 100755 --- a/test/tls-ld.sh +++ b/test/tls-ld.sh @@ -23,7 +23,7 @@ _Thread_local int foo = 3; EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,-relax -$QEMU $t/exe1 | grep -q '3 5 3 5' +$QEMU $t/exe1 | grep '3 5 3 5' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax -$QEMU $t/exe2 | grep -q '3 5 3 5' +$QEMU $t/exe2 | grep '3 5 3 5' diff --git a/test/tls-le-error.sh b/test/tls-le-error.sh index a16cb8af23..6b87eb70d1 100755 --- a/test/tls-le-error.sh +++ b/test/tls-le-error.sh @@ -6,5 +6,5 @@ __attribute__((tls_model("local-exec"))) static _Thread_local int foo = 5; int bar() { return foo; } EOF -! $CC -B. -shared -o $t/b.so $t/a.o >& $t/log -grep -q 'relocation .* against `foo` can not be used when making a shared object; recompile with -fPIC' $t/log +not $CC -B. -shared -o $t/b.so $t/a.o |& + grep 'relocation .* against `foo` can not be used when making a shared object; recompile with -fPIC' diff --git a/test/tls-le.sh b/test/tls-le.sh index 502c73b06b..b5724d8b91 100755 --- a/test/tls-le.sh +++ b/test/tls-le.sh @@ -23,7 +23,7 @@ __attribute__((tls_model("local-exec"))) _Thread_local int foo = 3; EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -$QEMU $t/exe1 | grep -q '3 5 3 5' +$QEMU $t/exe1 | grep '3 5 3 5' $CC -B. 
-o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax -$QEMU $t/exe2 | grep -q '3 5 3 5' +$QEMU $t/exe2 | grep '3 5 3 5' diff --git a/test/tls-nopic.sh b/test/tls-nopic.sh index 9f189cea93..a92c2789a2 100755 --- a/test/tls-nopic.sh +++ b/test/tls-nopic.sh @@ -24,4 +24,4 @@ __attribute__((tls_model("global-dynamic"))) _Thread_local int foo; EOF $CC -B. -o $t/exe $t/a.o $t/b.o -no-pie -$QEMU $t/exe | grep -q '3 5 3 5' +$QEMU $t/exe | grep '3 5 3 5' diff --git a/test/tls-pic.sh b/test/tls-pic.sh index e0774ce4b4..7f7dc065c4 100755 --- a/test/tls-pic.sh +++ b/test/tls-pic.sh @@ -23,4 +23,4 @@ __attribute__((tls_model("global-dynamic"))) _Thread_local int foo = 3; EOF $CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q '3 5 3 5' +$QEMU $t/exe | grep '3 5 3 5' diff --git a/test/tls-small-alignment.sh b/test/tls-small-alignment.sh index 1d334cd9bf..1992b5bad9 100755 --- a/test/tls-small-alignment.sh +++ b/test/tls-small-alignment.sh @@ -23,7 +23,7 @@ EOF $CC -B. -shared -o $t/d.so $t/a.o $t/b.o $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -$QEMU $t/exe1 | grep -q '^42$' +$QEMU $t/exe1 | grep '^42$' $CC -B. -o $t/exe2 $t/c.o $t/d.so -$QEMU $t/exe2 | grep -q '^42$' +$QEMU $t/exe2 | grep '^42$' diff --git a/test/tlsdesc-dlopen.sh b/test/tlsdesc-dlopen.sh index 70bfa14403..3ac1d60e8a 100755 --- a/test/tlsdesc-dlopen.sh +++ b/test/tlsdesc-dlopen.sh @@ -32,4 +32,4 @@ int main(int argc, char **argv) { EOF $CC -B. -o $t/exe $t/c.o -ldl -$QEMU $t/exe $t/b.so | grep -q '3 0 5' +$QEMU $t/exe $t/b.so | grep '3 0 5' diff --git a/test/tlsdesc-import.sh b/test/tlsdesc-import.sh index e2872b27af..7e89e5fb4f 100755 --- a/test/tlsdesc-import.sh +++ b/test/tlsdesc-import.sh @@ -21,4 +21,4 @@ _Thread_local int bar; EOF $CC -B. 
-o $t/exe $t/a.o $t/b.so -$QEMU $t/exe | grep -q '5 7' +$QEMU $t/exe | grep '5 7' diff --git a/test/tlsdesc-initial-exec.sh b/test/tlsdesc-initial-exec.sh index aabcdae53e..3475d137cf 100755 --- a/test/tlsdesc-initial-exec.sh +++ b/test/tlsdesc-initial-exec.sh @@ -27,13 +27,9 @@ int main() { EOF $CC -B. -o $t/exe1 $t/c.o $t/d.o $t/b.so -$QEMU $t/exe1 | grep -q '^5 5 5$' - -$OBJDUMP --dynamic-reloc $t/exe1 > $t/log1 -! grep -Eq 'TLS_?DESC' $t/log1 || false +$QEMU $t/exe1 | grep '^5 5 5$' +$OBJDUMP --dynamic-reloc $t/exe1 | not grep -E 'TLS_?DESC' $CC -B. -o $t/exe2 $t/c.o $t/d.o $t/b.so -Wl,--no-relax -$QEMU $t/exe2 | grep -q '^5 5 5$' - -$OBJDUMP --dynamic-reloc $t/exe2 > $t/log2 -grep -Eq 'TLS_?DESC' $t/log2 +$QEMU $t/exe2 | grep '^5 5 5$' +$OBJDUMP --dynamic-reloc $t/exe2 | grep -E 'TLS_?DESC' diff --git a/test/tlsdesc-local-dynamic.sh b/test/tlsdesc-local-dynamic.sh index 01d2d0fb18..7de5398393 100755 --- a/test/tlsdesc-local-dynamic.sh +++ b/test/tlsdesc-local-dynamic.sh @@ -33,7 +33,7 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -$QEMU $t/exe1 | grep -q '42 5' +$QEMU $t/exe1 | grep '42 5' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--no-relax -$QEMU $t/exe2 | grep -q '42 5' +$QEMU $t/exe2 | grep '42 5' diff --git a/test/tlsdesc-static.sh b/test/tlsdesc-static.sh index 41c89cb109..386bb780a1 100755 --- a/test/tlsdesc-static.sh +++ b/test/tlsdesc-static.sh @@ -20,7 +20,7 @@ _Thread_local int foo; EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -static -$QEMU $t/exe1 | grep -q 42 +$QEMU $t/exe1 | grep 42 $CC -B. -o $t/exe2 $t/a.o $t/b.o -static -Wl,-no-relax -$QEMU $t/exe2 | grep -q 42 +$QEMU $t/exe2 | grep 42 diff --git a/test/tlsdesc.sh b/test/tlsdesc.sh index dd743671c7..78cf0fdc87 100755 --- a/test/tlsdesc.sh +++ b/test/tlsdesc.sh @@ -35,15 +35,15 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -$QEMU $t/exe1 | grep -q '42 3 5' +$QEMU $t/exe1 | grep '42 3 5' $CC -B. 
-o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax -$QEMU $t/exe2 | grep -q '42 3 5' +$QEMU $t/exe2 | grep '42 3 5' $CC -B. -shared -o $t/c.so $t/a.o $CC -B. -o $t/exe3 $t/b.o $t/c.so -$QEMU $t/exe3 | grep -q '42 3 5' +$QEMU $t/exe3 | grep '42 3 5' $CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax $CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax -$QEMU $t/exe4 | grep -q '42 3 5' +$QEMU $t/exe4 | grep '42 3 5' diff --git a/test/trace-symbol.sh b/test/trace-symbol.sh index 752d028374..eaa1c2fc6f 100755 --- a/test/trace-symbol.sh +++ b/test/trace-symbol.sh @@ -31,6 +31,6 @@ EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.so \ -Wl,-y,foo -Wl,--trace-symbol=baz > $t/log -grep -q 'trace-symbol: .*/a.o: reference to foo' $t/log -grep -q 'trace-symbol: .*/b.o: definition of foo' $t/log -grep -q 'trace-symbol: .*/c.so: definition of baz' $t/log +grep 'trace-symbol: .*/a.o: reference to foo' $t/log +grep 'trace-symbol: .*/b.o: definition of foo' $t/log +grep 'trace-symbol: .*/c.so: definition of baz' $t/log diff --git a/test/trace.sh b/test/trace.sh index 2b8da462f1..959fabeac3 100755 --- a/test/trace.sh +++ b/test/trace.sh @@ -11,4 +11,4 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -Wl,-trace > $t/log -grep -q '/a\.o$' $t/log +grep '/a\.o$' $t/log diff --git a/test/undefined-glob-gc-sections.sh b/test/undefined-glob-gc-sections.sh index e18baec2b4..a910d736d7 100755 --- a/test/undefined-glob-gc-sections.sh +++ b/test/undefined-glob-gc-sections.sh @@ -24,6 +24,6 @@ EOF $CC -B. -o $t/exe2 $t/d.a $t/e.o -Wl,--undefined-glob='foo*' -Wl,--gc-sections readelf -W --symbols $t/exe2 > $t/log2 -grep -q foo $t/log2 -grep -q foobar $t/log2 -! grep -q baz $t/log2 || false +grep foo $t/log2 +grep foobar $t/log2 +not grep baz $t/log2 diff --git a/test/undefined-glob.sh b/test/undefined-glob.sh index ac5c775b99..053288d865 100755 --- a/test/undefined-glob.sh +++ b/test/undefined-glob.sh @@ -24,12 +24,12 @@ EOF $CC -B. -o $t/exe1 $t/d.a $t/e.o readelf -W --symbols $t/exe1 > $t/log1 -! 
grep -q foo $t/log1 || false -! grep -q foobar $t/log1 || false -! grep -q baz $t/log1 || false +not grep foo $t/log1 +not grep foobar $t/log1 +not grep baz $t/log1 $CC -B. -o $t/exe2 $t/d.a $t/e.o -Wl,--undefined-glob='foo*' readelf -W --symbols $t/exe2 > $t/log2 -grep -q foo $t/log2 -grep -q foobar $t/log2 -! grep -q baz $t/log2 || false +grep foo $t/log2 +grep foobar $t/log2 +not grep baz $t/log2 diff --git a/test/undefined.sh b/test/undefined.sh index cd66095d75..099fe08e1a 100755 --- a/test/undefined.sh +++ b/test/undefined.sh @@ -23,15 +23,15 @@ ar cr $t/d.a $t/b.o $t/c.o ./mold -static -o $t/exe $t/a.o $t/d.a readelf --symbols $t/exe > $t/log -! grep -q foo $t/log || false -! grep -q bar $t/log || false +not grep foo $t/log +not grep bar $t/log ./mold -static -o $t/exe $t/a.o $t/d.a -u foo readelf --symbols $t/exe > $t/log -grep -q foo $t/log -! grep -q bar $t/log || false +grep foo $t/log +not grep bar $t/log ./mold -static -o $t/exe $t/a.o $t/d.a -u foo --undefined=bar readelf --symbols $t/exe > $t/log -grep -q foo $t/log -grep -q bar $t/log +grep foo $t/log +grep bar $t/log diff --git a/test/undefined2.sh b/test/undefined2.sh index 84702b5848..0acab2ad1b 100755 --- a/test/undefined2.sh +++ b/test/undefined2.sh @@ -23,5 +23,5 @@ ar cr $t/d.a $t/b.o $t/c.o ./mold -static -o $t/exe $t/a.o $t/d.a -undefined foo readelf --symbols $t/exe > $t/log -grep -q foo $t/log -! grep -q ndefined $t/log || false +grep foo $t/log +not grep ndefined $t/log diff --git a/test/unkown-section-type.sh b/test/unkown-section-type.sh index 52c340222e..8f4833bdc8 100755 --- a/test/unkown-section-type.sh +++ b/test/unkown-section-type.sh @@ -5,5 +5,5 @@ cat < /dev/null || skip .section .my_section,"a",@0x80000000 EOF -! $CC -B. -o $t/exe $t/a.o >& $t/log1 -grep -q 'unsupported section type: 0x80000000' $t/log1 +not $CC -B. 
-o $t/exe $t/a.o |& + grep 'unsupported section type: 0x80000000' diff --git a/test/unresolved-symbols.sh b/test/unresolved-symbols.sh index 274697c936..4508053d4b 100755 --- a/test/unresolved-symbols.sh +++ b/test/unresolved-symbols.sh @@ -6,20 +6,20 @@ int foo(); int main() { foo(); } EOF -! $CC -B. -o $t/exe $t/a.o 2>&1 | grep -q 'undefined.*foo' +not $CC -B. -o $t/exe $t/a.o |& grep 'undefined.*foo' -! $CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=report-all 2>&1 \ - | grep -q 'undefined.*foo' +not $CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=report-all |& + grep 'undefined.*foo' $CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=ignore-all -! readelf --dyn-syms $t/exe | grep -w foo || false +readelf --dyn-syms $t/exe | not grep -w foo $CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=report-all \ - -Wl,--warn-unresolved-symbols 2>&1 | grep -q 'undefined.*foo' + -Wl,--warn-unresolved-symbols |& grep 'undefined.*foo' -! $CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=ignore-in-object-files 2>&1 \ - | grep -q 'undefined.*foo' +$CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=ignore-in-object-files |& + not grep 'undefined.*foo' -! $CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=ignore-in-shared-libs 2>&1 \ - | grep -q 'undefined.*foo' +not $CC -B. -o $t/exe $t/a.o -Wl,-unresolved-symbols=ignore-in-shared-libs |& + grep 'undefined.*foo' diff --git a/test/unresolved-symbols2.sh b/test/unresolved-symbols2.sh index 566f50653e..6a9dfa6a2e 100755 --- a/test/unresolved-symbols2.sh +++ b/test/unresolved-symbols2.sh @@ -7,4 +7,4 @@ int bar() { foo(); } EOF $CC -B. 
-shared -o $t/b.so $t/a.o -Wl,-z,defs -Wl,--unresolved-symbols,ignore-in-object-files -readelf -W --dyn-syms $t/b.so | grep -q ' UND foo$' +readelf -W --dyn-syms $t/b.so | grep ' UND foo$' diff --git a/test/version-script-search-paths.sh b/test/version-script-search-paths.sh index 86da27fb0a..f21f530c7b 100755 --- a/test/version-script-search-paths.sh +++ b/test/version-script-search-paths.sh @@ -17,4 +17,4 @@ EOF $CC -B. -shared -o $t/c.so -Wl,-L$t/foo/bar -Wl,-version-script,a.ver $t/b.s readelf --version-info $t/c.so > $t/log -grep -Fq 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log +grep -F 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log diff --git a/test/version-script.sh b/test/version-script.sh index 83d71aceae..cc1890859e 100755 --- a/test/version-script.sh +++ b/test/version-script.sh @@ -16,4 +16,5 @@ EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.s readelf --version-info $t/c.so > $t/log -grep -Fq 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log +grep -F 'Rev: 1 Flags: BASE Index: 1 Cnt: 1 Name: c.so' $t/log +grep -F 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log diff --git a/test/version-script10.sh b/test/version-script10.sh index 5576dc0a0f..c3cdb7f154 100755 --- a/test/version-script10.sh +++ b/test/version-script10.sh @@ -15,6 +15,6 @@ EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.s readelf --dyn-syms $t/c.so > $t/log -grep -q ' foo1@@VER1$' $t/log -grep -q ' foo2@@VER1$' $t/log -! grep -q ' foo3@@VER1$' $t/log || false +grep ' foo1@@VER1$' $t/log +grep ' foo2@@VER1$' $t/log +not grep ' foo3@@VER1$' $t/log diff --git a/test/version-script11.sh b/test/version-script11.sh index efc0fc8fd5..b85677df2e 100755 --- a/test/version-script11.sh +++ b/test/version-script11.sh @@ -14,6 +14,6 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -q 'foo@@VER_X1' $t/log -! grep -q ' bar' $t/log || false -! 
grep -q ' baz' $t/log || false +grep 'foo@@VER_X1' $t/log +not grep ' bar' $t/log +not grep ' baz' $t/log diff --git a/test/version-script12.sh b/test/version-script12.sh index 1910b126d9..79abb02a4d 100755 --- a/test/version-script12.sh +++ b/test/version-script12.sh @@ -20,6 +20,6 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -q ' xyz' $t/log -grep -q ' foo_bar' $t/log -! grep -q ' foo$' $t/log || false +grep ' xyz' $t/log +grep ' foo_bar' $t/log +not grep ' foo$' $t/log diff --git a/test/version-script13.sh b/test/version-script13.sh index 85eba0359e..5ab67e26fa 100755 --- a/test/version-script13.sh +++ b/test/version-script13.sh @@ -16,5 +16,5 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -q ' foobar' $t/log -! grep -q ' foo$' $t/log || false +grep ' foobar' $t/log +not grep ' foo$' $t/log diff --git a/test/version-script14.sh b/test/version-script14.sh index b748667ae9..b2c9a2df2d 100755 --- a/test/version-script14.sh +++ b/test/version-script14.sh @@ -23,9 +23,9 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -q ' xyz' $t/log -! grep -q ' foobarzx' $t/log || false -grep -q ' foobarcx' $t/log -grep -q ' foo123bar456bx' $t/log -! grep -q ' foo123bar456c' $t/log || false -! grep -q ' foo123bar456x' $t/log || false +grep ' xyz' $t/log +not grep ' foobarzx' $t/log +grep ' foobarcx' $t/log +grep ' foo123bar456bx' $t/log +not grep ' foo123bar456c' $t/log +not grep ' foo123bar456x' $t/log diff --git a/test/version-script15.sh b/test/version-script15.sh index 4af8f45f34..d27b1574d2 100755 --- a/test/version-script15.sh +++ b/test/version-script15.sh @@ -20,7 +20,7 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -q ' azZ' $t/log -grep -q ' czZ' $t/log -! grep -q ' azC' $t/log || false -! 
grep -q ' aaZ' $t/log || false +grep ' azZ' $t/log +grep ' czZ' $t/log +not grep ' azC' $t/log +not grep ' aaZ' $t/log diff --git a/test/version-script16.sh b/test/version-script16.sh index cd84d3c739..c3e15a0e1e 100755 --- a/test/version-script16.sh +++ b/test/version-script16.sh @@ -10,4 +10,4 @@ void foobar() {} EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o -readelf --dyn-syms $t/c.so | grep -q foobar +readelf --dyn-syms $t/c.so | grep foobar diff --git a/test/version-script17.sh b/test/version-script17.sh index 643e5aaddc..7c78489c94 100755 --- a/test/version-script17.sh +++ b/test/version-script17.sh @@ -13,20 +13,18 @@ void bar() {} EOF $CC -B. -o $t/exe1 $t/a.o $t/b.so -nm -g $t/exe1 | grep -q foo +nm -g $t/exe1 | grep foo cat <<'EOF' > $t/c.ver { local: *; global: xyz; }; EOF $CC -B. -o $t/exe2 $t/a.o $t/b.so -Wl,--version-script=$t/c.ver -Wl,--undefined-version -nm -g $t/exe2 > $t/log2 -! grep -q foo $t/log2 || false +nm -g $t/exe2 | not grep foo cat <<'EOF' > $t/d.ver { local: *; }; EOF $CC -B. -o $t/exe3 $t/a.o $t/b.so -Wl,--version-script=$t/d.ver -nm -g $t/exe3 > $t/log3 -! grep -q foo $t/log3 || false +nm -g $t/exe3 | not grep foo diff --git a/test/version-script18.sh b/test/version-script18.sh index a5ae3def08..d21620fa25 100755 --- a/test/version-script18.sh +++ b/test/version-script18.sh @@ -19,5 +19,5 @@ namespace libalpha { EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o -readelf --wide --dyn-syms $t/c.so | grep libalpha | grep -q Bar +readelf --wide --dyn-syms $t/c.so | grep libalpha | grep Bar diff --git a/test/version-script19.sh b/test/version-script19.sh index 45da96f18f..04c96f6848 100755 --- a/test/version-script19.sh +++ b/test/version-script19.sh @@ -12,5 +12,5 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf -W --dyn-syms $t/c.so > $t/log -! 
grep -Eq foobar $t/log || false -grep -Eq 'GLOBAL.*baz' $t/log +not grep -E foobar $t/log +grep -E 'GLOBAL.*baz' $t/log diff --git a/test/version-script2.sh b/test/version-script2.sh index 2136169dbd..2af6a11d03 100755 --- a/test/version-script2.sh +++ b/test/version-script2.sh @@ -39,6 +39,6 @@ $CC -B. -o $t/exe $t/c.o $t/b.so $QEMU $t/exe readelf --dyn-syms $t/exe > $t/log -grep -Fq 'foo@ver1' $t/log -grep -Fq 'bar@ver2' $t/log -grep -Fq 'baz@ver3' $t/log +grep -F 'foo@ver1' $t/log +grep -F 'bar@ver2' $t/log +grep -F 'baz@ver3' $t/log diff --git a/test/version-script20.sh b/test/version-script20.sh index d0363f996f..d0c7193eed 100755 --- a/test/version-script20.sh +++ b/test/version-script20.sh @@ -14,6 +14,6 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf -W --dyn-syms $t/c.so > $t/log -grep -Fq 'foo_x@@VER2' $t/log -grep -Fq 'foo_y@@VER1' $t/log -grep -Fq 'foo_z@@VER1' $t/log +grep -F 'foo_x@@VER2' $t/log +grep -F 'foo_y@@VER1' $t/log +grep -F 'foo_z@@VER1' $t/log diff --git a/test/version-script21.sh b/test/version-script21.sh index 3d75b7629d..7bd3adee53 100755 --- a/test/version-script21.sh +++ b/test/version-script21.sh @@ -14,6 +14,6 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf -W --dyn-syms $t/c.so > $t/log -grep -Fq 'foo_x@@VER1' $t/log -grep -Fq 'foo_y@@VER2' $t/log -grep -Fq 'foo_z@@VER2' $t/log +grep -F 'foo_x@@VER1' $t/log +grep -F 'foo_y@@VER2' $t/log +grep -F 'foo_z@@VER2' $t/log diff --git a/test/version-script22.sh b/test/version-script22.sh index 1b17b4d80c..ffed368d86 100755 --- a/test/version-script22.sh +++ b/test/version-script22.sh @@ -12,4 +12,4 @@ EOF $CC -B. 
-shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf -W --dyn-syms $t/c.so > $t/log -grep -Fq 'foo_bar@@VER2' $t/log +grep -F 'foo_bar@@VER2' $t/log diff --git a/test/version-script23.sh b/test/version-script23.sh index 3e1dd29d0e..158d7aaeba 100755 --- a/test/version-script23.sh +++ b/test/version-script23.sh @@ -12,4 +12,4 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf -W --dyn-syms $t/c.so > $t/log -grep -Fq 'foo?@@VER1' $t/log +grep -F 'foo?@@VER1' $t/log diff --git a/test/version-script3.sh b/test/version-script3.sh index d4b02986af..0268422c67 100755 --- a/test/version-script3.sh +++ b/test/version-script3.sh @@ -35,6 +35,6 @@ $CC -B. -o $t/exe $t/c.o $t/b.so $QEMU $t/exe readelf --dyn-syms $t/exe > $t/log -grep -Fq 'foo@ver1' $t/log -grep -Fq 'bar@ver2' $t/log -grep -Fq 'baz@ver2' $t/log +grep -F 'foo@ver1' $t/log +grep -F 'bar@ver2' $t/log +grep -F 'baz@ver2' $t/log diff --git a/test/version-script4.sh b/test/version-script4.sh index c1dcaf2092..a5a4f82199 100755 --- a/test/version-script4.sh +++ b/test/version-script4.sh @@ -26,5 +26,5 @@ EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -Fq _ZN3foo3barE $t/log -! grep -Fq ' bar' $t/log || false +grep -F _ZN3foo3barE $t/log +not grep -F ' bar' $t/log diff --git a/test/version-script5.sh b/test/version-script5.sh index 119569bac2..56d453d015 100755 --- a/test/version-script5.sh +++ b/test/version-script5.sh @@ -16,5 +16,5 @@ EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -Fq foo $t/log -! grep -Fq ' main' $t/log || false +grep -F foo $t/log +not grep -F ' main' $t/log diff --git a/test/version-script6.sh b/test/version-script6.sh index 44f809ef31..5490347e3e 100755 --- a/test/version-script6.sh +++ b/test/version-script6.sh @@ -29,6 +29,6 @@ EOF $CC -B. 
-shared -Wl,-version-script,$t/d.ver -o $t/f.so $t/e.o $t/c.so -Wl,--undefined-version readelf --dyn-syms $t/f.so > $t/log -grep -q 'foo@VER_X1' $t/log -grep -q 'bar@VER_X2' $t/log -grep -q 'baz@@VER_Y2' $t/log +grep 'foo@VER_X1' $t/log +grep 'bar@VER_X2' $t/log +grep 'baz@@VER_Y2' $t/log diff --git a/test/version-script7.sh b/test/version-script7.sh index 9a50522eda..adc83f0b3f 100755 --- a/test/version-script7.sh +++ b/test/version-script7.sh @@ -13,5 +13,5 @@ EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf --dyn-syms $t/c.so > $t/log -grep -q 'foo$' $t/log -grep -q 'bar@@VER_X1' $t/log +grep 'foo$' $t/log +grep 'bar@@VER_X1' $t/log diff --git a/test/version-script8.sh b/test/version-script8.sh index b48d7545cb..004ae3b256 100755 --- a/test/version-script8.sh +++ b/test/version-script8.sh @@ -33,6 +33,6 @@ $CC -B. -o $t/exe $t/c.o $t/b.so $QEMU $t/exe readelf --dyn-syms $t/b.so > $t/log -grep -Fq 'foo@@ver1' $t/log -grep -Fq 'bar@@ver2' $t/log -! grep -Fq 'baz' $t/log || false +grep -F 'foo@@ver1' $t/log +grep -F 'bar@@ver2' $t/log +not grep -F 'baz' $t/log diff --git a/test/version-script9.sh b/test/version-script9.sh index c472544280..f2dc304061 100755 --- a/test/version-script9.sh +++ b/test/version-script9.sh @@ -15,4 +15,4 @@ EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.s readelf --dyn-syms $t/c.so > $t/log -grep -q ' foo@@VER1$' $t/log +grep ' foo@@VER1$' $t/log diff --git a/test/version.sh b/test/version.sh index 9fba3660f0..0a308f0910 100755 --- a/test/version.sh +++ b/test/version.sh @@ -2,14 +2,14 @@ . 
$(dirname $0)/common.inc # OneTBB isn't tsan-clean -nm mold | grep -q '__tsan_init' && skip +nm mold | grep '__tsan_init' && skip -./mold -v | grep -q 'mold .*compatible with GNU ld' -./mold --version | grep -q 'mold .*compatible with GNU ld' +./mold -v | grep 'mold .*compatible with GNU ld' +./mold --version | grep 'mold .*compatible with GNU ld' -./mold -V | grep -q 'mold .*compatible with GNU ld' -./mold -V | grep -q elf_x86_64 -./mold -V | grep -q elf_i386 +./mold -V | grep 'mold .*compatible with GNU ld' +./mold -V | grep elf_x86_64 +./mold -V | grep elf_i386 cat < @@ -20,11 +20,10 @@ int main() { EOF rm -f $t/exe -$CC -B. -Wl,--version -o $t/exe1 $t/a.o 2>&1 | grep -q mold -! [ -f $t/exe1 ] || false +$CC -B. -Wl,--version -o $t/exe1 $t/a.o |& grep mold +not [ -f $t/exe1 ] -$CC -B. -Wl,-v -o $t/exe2 $t/a.o 2>&1 | grep -q mold -$QEMU $t/exe2 | grep -q 'Hello world' +$CC -B. -Wl,-v -o $t/exe2 $t/a.o |& grep mold +$QEMU $t/exe2 | grep 'Hello world' -! ./mold --v >& $t/log -grep -q 'unknown command line option:' $t/log +not ./mold --v |& grep 'unknown command line option:' diff --git a/test/versioned-undef.sh b/test/versioned-undef.sh index 4424d2c737..20ccabc360 100755 --- a/test/versioned-undef.sh +++ b/test/versioned-undef.sh @@ -40,4 +40,4 @@ int main() { EOF $CC -B. -o $t/exe $t/d.o $t/c.so -$QEMU $t/exe | grep -q '^1 2 3 3 4$' +$QEMU $t/exe | grep '^1 2 3 3 4$' diff --git a/test/visibility.sh b/test/visibility.sh index b7bf79d9c2..d2a4870026 100755 --- a/test/visibility.sh +++ b/test/visibility.sh @@ -19,5 +19,4 @@ int main() { return bar; } EOF $CC -B. -shared -o $t/e.so $t/c.a $t/d.o -readelf --dyn-syms $t/e.so > $t/log -! grep -Fq foo $t/log || false +readelf --dyn-syms $t/e.so | not grep -F foo diff --git a/test/warn-common.sh b/test/warn-common.sh index f92fd04921..d092ed54f8 100755 --- a/test/warn-common.sh +++ b/test/warn-common.sh @@ -13,8 +13,5 @@ int main() { } EOF -$CC -B. -o $t/exe $t/a.o $t/b.o > $t/log -! 
grep -Fq 'multiple common symbols' $t/log || false - -$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-warn-common 2> $t/log -grep -Fq 'multiple common symbols' $t/log +$CC -B. -o $t/exe $t/a.o $t/b.o | not grep -F 'multiple common symbols' +$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-warn-common |& grep -F 'multiple common symbols' diff --git a/test/warn-symbol-type.sh b/test/warn-symbol-type.sh index 1c80ab0d7a..1c53dc3e1b 100755 --- a/test/warn-symbol-type.sh +++ b/test/warn-symbol-type.sh @@ -3,16 +3,11 @@ cat < -int times = -1; /* times collides with clock_t times(struct tms *buffer); */ - -int -main () -{ +int times = -1; // times collides with clock_t times(struct tms *buffer) +int main() { printf ("times: %d\n", times); - return 0; } EOF -$CC -B. -shared -o $t/a.so $t/a.o >& $t/log - -grep -q "warning: symbol type mismatch: times" $t/log +$CC -B. -shared -o $t/a.so $t/a.o |& + grep 'warning: symbol type mismatch: times' diff --git a/test/warn-unresolved-symbols.sh b/test/warn-unresolved-symbols.sh index 2336d5bccf..186f0f061a 100755 --- a/test/warn-unresolved-symbols.sh +++ b/test/warn-unresolved-symbols.sh @@ -8,12 +8,10 @@ int main() { } EOF -! $CC -B. -o $t/exe $t/a.o 2>&1 \ - | grep -q 'undefined symbol:.*foo' +not $CC -B. -o $t/exe $t/a.o |& grep 'undefined symbol:.*foo' -$CC -B. -o $t/exe $t/a.o -Wl,-warn-unresolved-symbols 2>&1 \ - | grep -q 'undefined symbol:.*foo' +$CC -B. -o $t/exe $t/a.o -Wl,-warn-unresolved-symbols |& + grep 'undefined symbol:.*foo' -! $CC -B. -o $t/exe $t/a.o -Wl,-warn-unresolved-symbols \ - --error-unresolved-symbols 2>&1 \ - | grep -q 'undefined symbol:.*foo' +not $CC -B. -o $t/exe $t/a.o -Wl,-warn-unresolved-symbols \ + -Wl,--error-unresolved-symbols |& grep 'undefined symbol:.*foo' diff --git a/test/weak-export-dso.sh b/test/weak-export-dso.sh index d99a2c608d..964e9d75fb 100755 --- a/test/weak-export-dso.sh +++ b/test/weak-export-dso.sh @@ -14,5 +14,5 @@ EOF $CC -B. -o $t/b.so $t/a.o -shared $CC -B. 
-o $t/c.so $t/a.o -shared -Wl,-z,defs -readelf --dyn-syms $t/b.so | grep -q 'WEAK DEFAULT UND foo' -readelf --dyn-syms $t/c.so | grep -q 'WEAK DEFAULT UND foo' +readelf --dyn-syms $t/b.so | grep 'WEAK DEFAULT UND foo' +readelf --dyn-syms $t/c.so | grep 'WEAK DEFAULT UND foo' diff --git a/test/weak-export-dso2.sh b/test/weak-export-dso2.sh index 5b1dee8eb3..0aad7911f2 100755 --- a/test/weak-export-dso2.sh +++ b/test/weak-export-dso2.sh @@ -18,4 +18,4 @@ int main() { EOF $CC -B. -o $t/d.so $t/c.o $t/b.so -shared -readelf -W --dyn-syms $t/d.so | grep -q 'WEAK DEFAULT .* UND foo' +readelf -W --dyn-syms $t/d.so | grep 'WEAK DEFAULT .* UND foo' diff --git a/test/weak-export-exe.sh b/test/weak-export-exe.sh index 82110e6186..5385dc8c5a 100755 --- a/test/weak-export-exe.sh +++ b/test/weak-export-exe.sh @@ -12,5 +12,5 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -! readelf --dyn-syms $t/exe | grep -q 'WEAK DEFAULT UND foo' || false -$QEMU $t/exe | grep -q '^3$' +readelf --dyn-syms $t/exe | not grep 'WEAK DEFAULT UND foo' +$QEMU $t/exe | grep '^3$' diff --git a/test/weak-undef-dso.sh b/test/weak-undef-dso.sh index db1788b0c7..9c73641501 100755 --- a/test/weak-undef-dso.sh +++ b/test/weak-undef-dso.sh @@ -15,7 +15,7 @@ int main() { printf("bar=%d\n", bar()); } EOF $CC -B. -o $t/exe1 $t/c.o $t/b.so -$QEMU $t/exe1 | grep -q 'bar=-1' +$QEMU $t/exe1 | grep 'bar=-1' cat < @@ -25,4 +25,4 @@ int main() { printf("bar=%d\n", bar()); } EOF $CC -B. -o $t/exe2 $t/d.o $t/b.so -$QEMU $t/exe2 | grep -q 'bar=5' +$QEMU $t/exe2 | grep 'bar=5' diff --git a/test/weak-undef.sh b/test/weak-undef.sh index e8e8667e6d..60a47ac593 100755 --- a/test/weak-undef.sh +++ b/test/weak-undef.sh @@ -26,7 +26,7 @@ $CC -B. -o $t/exe2 $t/b.o -no-pie $CC -B. -o $t/exe3 $t/a.o $t/c.o -pie $CC -B. 
-o $t/exe4 $t/b.o $t/c.o -no-pie -$QEMU $t/exe1 | grep -q '^-1$' -$QEMU $t/exe2 | grep -q '^-1$' -$QEMU $t/exe3 | grep -q '^2$' -$QEMU $t/exe4 | grep -q '^2$' +$QEMU $t/exe1 | grep '^-1$' +$QEMU $t/exe2 | grep '^-1$' +$QEMU $t/exe3 | grep '^2$' +$QEMU $t/exe4 | grep '^2$' diff --git a/test/weak-undef2.sh b/test/weak-undef2.sh index 80350f7740..d6594ea765 100755 --- a/test/weak-undef2.sh +++ b/test/weak-undef2.sh @@ -16,5 +16,4 @@ int foo(); int bar() { return foo(); } EOF -! $CC -B. -o $t/exe $t/a.o $t/b.o >& $t/log -grep -q 'undefined symbol: foo' $t/log +not $CC -B. -o $t/exe $t/a.o $t/b.o |& grep 'undefined symbol: foo' diff --git a/test/weak-undef4.sh b/test/weak-undef4.sh index 818b589257..3a9bc3c7cd 100755 --- a/test/weak-undef4.sh +++ b/test/weak-undef4.sh @@ -33,5 +33,5 @@ ar rcs $t/d.a $t/c.o $CC -B. -o $t/exe1 $t/a.o $t/d.a $CC -B. -o $t/exe2 $t/b.o $t/d.a -$QEMU $t/exe1 | grep -q '^2$' -$QEMU $t/exe2 | grep -q '^-1$' +$QEMU $t/exe1 | grep '^2$' +$QEMU $t/exe2 | grep '^-1$' diff --git a/test/weak-undef5.sh b/test/weak-undef5.sh index 887c8e931c..a5641eadc0 100755 --- a/test/weak-undef5.sh +++ b/test/weak-undef5.sh @@ -17,5 +17,5 @@ EOF $CC -B. -o $t/libfoobar.so $t/b.o -shared $CC -B. -o $t/exe $t/a.o -Wl,--as-needed -L$t -lfoobar -Wl,-rpath,$t -readelf --dynamic $t/exe | grep -q 'NEEDED.*libfoobar' -$QEMU $t/exe | grep -q '^2$' +readelf --dynamic $t/exe | grep 'NEEDED.*libfoobar' +$QEMU $t/exe | grep '^2$' diff --git a/test/whole-archive.sh b/test/whole-archive.sh index 721acf37c7..af0b8caec5 100755 --- a/test/whole-archive.sh +++ b/test/whole-archive.sh @@ -15,18 +15,18 @@ ar cr $t/d.a $t/b.o $t/c.o $CC -B. -nostdlib -o $t/exe $t/a.o $t/d.a readelf --symbols $t/exe > $t/log -! grep -q fn1 $t/log || false -! grep -q fn2 $t/log || false +not grep fn1 $t/log +not grep fn2 $t/log $CC -B. -nostdlib -o $t/exe $t/a.o -Wl,--whole-archive $t/d.a readelf --symbols $t/exe > $t/log -grep -q fn1 $t/log -grep -q fn2 $t/log +grep fn1 $t/log +grep fn2 $t/log $CC -B. 
-nostdlib -o $t/exe $t/a.o -Wl,--whole-archive \ -Wl,--no-whole-archive $t/d.a readelf --symbols $t/exe > $t/log -! grep -q fn1 $t/log || false -! grep -q fn2 $t/log || false +not grep fn1 $t/log +not grep fn2 $t/log diff --git a/test/wrap-lto.sh b/test/wrap-lto.sh index 0e2fb52bce..bd205af1eb 100755 --- a/test/wrap-lto.sh +++ b/test/wrap-lto.sh @@ -36,10 +36,10 @@ int main() { EOF $CC -B. -o $t/exe $t/a.so $t/b.o -flto -$QEMU $t/exe | grep -q '^foo$' +$QEMU $t/exe | grep '^foo$' $CC -B. -o $t/exe $t/a.so $t/b.o -Wl,-wrap,foo -flto -$QEMU $t/exe | grep -q '^wrap_foo$' +$QEMU $t/exe | grep '^wrap_foo$' $CC -B. -o $t/exe $t/a.so $t/c.o -Wl,-wrap,foo -flto -$QEMU $t/exe | grep -q '^foo$' +$QEMU $t/exe | grep '^foo$' diff --git a/test/wrap.sh b/test/wrap.sh index 6576fcc5f0..6a841b31ba 100755 --- a/test/wrap.sh +++ b/test/wrap.sh @@ -34,10 +34,10 @@ int main() { EOF $CC -B. -o $t/exe $t/a.so $t/b.o -$QEMU $t/exe | grep -q '^foo$' +$QEMU $t/exe | grep '^foo$' $CC -B. -o $t/exe $t/a.so $t/b.o -Wl,-wrap,foo -$QEMU $t/exe | grep -q '^wrap_foo$' +$QEMU $t/exe | grep '^wrap_foo$' $CC -B. -o $t/exe $t/a.so $t/c.o -Wl,-wrap,foo -$QEMU $t/exe | grep -q '^foo$' +$QEMU $t/exe | grep '^foo$' diff --git a/test/z-cet-report.sh b/test/z-cet-report.sh index a0d90d6072..dc0b15655d 100755 --- a/test/z-cet-report.sh +++ b/test/z-cet-report.sh @@ -9,9 +9,9 @@ EOF $CC -B. -o $t/exe $t/a.o $CC -B. -o $t/exe $t/a.o -Wl,-z,cet-report=warning >& $t/log -grep -q 'a.o: -cet-report=warning: missing GNU_PROPERTY_X86_FEATURE_1_IBT' $t/log -grep -q 'a.o: -cet-report=warning: missing GNU_PROPERTY_X86_FEATURE_1_SHSTK' $t/log +grep 'a.o: -cet-report=warning: missing GNU_PROPERTY_X86_FEATURE_1_IBT' $t/log +grep 'a.o: -cet-report=warning: missing GNU_PROPERTY_X86_FEATURE_1_SHSTK' $t/log -! $CC -B. 
-o $t/exe $t/a.o -Wl,-z,cet-report=error >& $t/log -grep -q 'a.o: -cet-report=error: missing GNU_PROPERTY_X86_FEATURE_1_IBT' $t/log -grep -q 'a.o: -cet-report=error: missing GNU_PROPERTY_X86_FEATURE_1_SHSTK' $t/log +not $CC -B. -o $t/exe $t/a.o -Wl,-z,cet-report=error >& $t/log +grep 'a.o: -cet-report=error: missing GNU_PROPERTY_X86_FEATURE_1_IBT' $t/log +grep 'a.o: -cet-report=error: missing GNU_PROPERTY_X86_FEATURE_1_SHSTK' $t/log diff --git a/test/z-defs.sh b/test/z-defs.sh index af68bab0ec..7670b7a053 100755 --- a/test/z-defs.sh +++ b/test/z-defs.sh @@ -9,11 +9,11 @@ EOF $CC -B. -shared -o $t/b.so $t/a.o $CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,undefs -! $CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,defs 2> $t/log || false -grep -q 'undefined symbol:.* foo' $t/log +not $CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,defs |& + grep 'undefined symbol:.* foo' -! $CC -B. -shared -o $t/b.so $t/a.o -Wl,-no-undefined 2> $t/log || false -grep -q 'undefined symbol:.* foo' $t/log +not $CC -B. -shared -o $t/b.so $t/a.o -Wl,-no-undefined |& + grep 'undefined symbol:.* foo' -$CC -B. -shared -o $t/c.so $t/a.o -Wl,-z,defs -Wl,--warn-unresolved-symbols 2> $t/log -grep -q 'undefined symbol:.* foo$' $t/log +$CC -B. -shared -o $t/c.so $t/a.o -Wl,-z,defs -Wl,--warn-unresolved-symbols |& + grep 'undefined symbol:.* foo$' diff --git a/test/z-dynamic-undefined-weak-exe.sh b/test/z-dynamic-undefined-weak-exe.sh new file mode 100755 index 0000000000..4a91c87a3d --- /dev/null +++ b/test/z-dynamic-undefined-weak-exe.sh @@ -0,0 +1,13 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < /dev/null || skip -readelf -WS $t/exe1 | grep -Fq .relr.dyn || skip -$QEMU $t/exe1 2> /dev/null | grep -q Hello || skip +readelf -WS $t/exe1 | grep -F .relr.dyn || skip +$QEMU $t/exe1 2> /dev/null | grep Hello || skip $CC -B. 
-o $t/exe2 $t/a.o -pie -Wl,-z,pack-relative-relocs -$QEMU $t/exe2 | grep -q Hello +$QEMU $t/exe2 | grep Hello readelf --dynamic $t/exe2 > $t/log2 -grep -wq RELR $t/log2 -grep -wq RELRSZ $t/log2 -grep -wq RELRENT $t/log2 +grep -Ew 'RELR|: 24' $t/log2 +grep -Ew 'RELRSZ|: 23' $t/log2 +grep -Ew 'RELRENT|: 25' $t/log2 diff --git a/test/z-rodynamic.sh b/test/z-rodynamic.sh index 44013dad40..d9c9a89a56 100755 --- a/test/z-rodynamic.sh +++ b/test/z-rodynamic.sh @@ -6,7 +6,7 @@ int main() {} EOF $CC -B. -o $t/exe1 $t/a.o -readelf -WS $t/exe1 | grep -q '\.dynamic.* WA ' +readelf -WS $t/exe1 | grep '\.dynamic.* WA ' $CC -B. -o $t/exe2 $t/a.o -Wl,-z,rodynamic -readelf -WS $t/exe2 | grep -q '\.dynamic.* A ' +readelf -WS $t/exe2 | grep '\.dynamic.* A ' diff --git a/test/z-sectionheader.sh b/test/z-sectionheader.sh index 2c9c2318a1..7a8efe6337 100755 --- a/test/z-sectionheader.sh +++ b/test/z-sectionheader.sh @@ -10,6 +10,6 @@ int main() { EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,nosectionheader -$QEMU $t/exe | grep -q 'Hello world' +$QEMU $t/exe | grep 'Hello world' -readelf -h $t/exe 2>&1 | grep -Eq 'Size of section headers:\s+0 ' +readelf -h $t/exe |& grep -E 'Size of section headers:\s+0 ' diff --git a/test/z-separate-code.sh b/test/z-separate-code.sh index fcc162bd24..40f02185e0 100755 --- a/test/z-separate-code.sh +++ b/test/z-separate-code.sh @@ -12,10 +12,10 @@ int main() { EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,separate-loadable-segments -$QEMU $t/exe1 | grep -q 'Hello world' +$QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -Wl,-z,separate-code -Wl,-z,norelro -$QEMU $t/exe2 | grep -q 'Hello world' +$QEMU $t/exe2 | grep 'Hello world' $CC -B. -o $t/exe3 $t/a.o -Wl,-z,noseparate-code -Wl,-z,norelro -$QEMU $t/exe3 | grep -q 'Hello world' +$QEMU $t/exe3 | grep 'Hello world' diff --git a/test/z-stack-size.sh b/test/z-stack-size.sh index d3fe717504..daba61c9b5 100755 --- a/test/z-stack-size.sh +++ b/test/z-stack-size.sh @@ -6,4 +6,4 @@ int main() {} EOF $CC -B. 
-o $t/exe $t/a.o -Wl,-z,stack-size=0x900000 -readelf -W --segments $t/exe | grep -q 'GNU_STACK .* 0x900000 RW' +readelf -W --segments $t/exe | grep 'GNU_STACK .* 0x900000 RW' diff --git a/test/z-start-stop-visibility.sh b/test/z-start-stop-visibility.sh index cf44248039..a39e6ce997 100755 --- a/test/z-start-stop-visibility.sh +++ b/test/z-start-stop-visibility.sh @@ -14,15 +14,15 @@ EOF $CC -B. -o $t/exe1 $t/a.o readelf -W --dyn-syms $t/exe1 > $t/log1 -! grep -q __start_hello $t/log1 || false -! grep -q __stop_hello $t/log1 || false +not grep __start_hello $t/log1 +not grep __stop_hello $t/log1 $CC -B. -o $t/exe2 $t/a.o -Wl,-z,start-stop-visibility=hidden readelf -W --dyn-syms $t/exe2 > $t/log2 -! grep -q __start_hello $t/log2 || false -! grep -q __stop_hello $t/log2 || false +not grep __start_hello $t/log2 +not grep __stop_hello $t/log2 $CC -B. -o $t/exe3 $t/a.o -Wl,-z,start-stop-visibility=protected readelf -W --dyn-syms $t/exe3 > $t/log3 -grep -q __start_hello $t/log3 -grep -q __stop_hello $t/log3 +grep __start_hello $t/log3 +grep __stop_hello $t/log3 diff --git a/test/z-unknown.sh b/test/z-unknown.sh deleted file mode 100755 index c2531462e4..0000000000 --- a/test/z-unknown.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -./mold -z no-such-opt 2>&1 | grep -q 'unknown command line option: -z no-such-opt' -./mold -zno-such-opt 2>&1 | grep -q 'unknown command line option: -zno-such-opt' diff --git a/third-party/blake3/.github/workflows/ci.yml b/third-party/blake3/.github/workflows/ci.yml index e93ecb3835..e0dcd33d32 100644 --- a/third-party/blake3/.github/workflows/ci.yml +++ b/third-party/blake3/.github/workflows/ci.yml @@ -34,7 +34,7 @@ jobs: # https://github.com/rust-lang/libs-team/issues/72. # This test target is here so that we notice if we accidentally bump # the MSRV, but it's not a promise that we won't bump it. 
- "1.66.1", + "1.70.0", ] steps: @@ -126,22 +126,31 @@ jobs: - name: cargo test C bindings intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./c/blake3_c_rust_bindings + - name: cargo test C bindings no AVX-512 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 + - name: cargo test C bindings no AVX2 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 + - name: cargo test C bindings no SSE41 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 + - name: cargo test C bindings no SSE2 + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + env: + CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_SSE2 # Reference impl doc test. - name: reference impl doc test run: cargo test working-directory: ./reference_impl - # the new guts crate - - name: guts test - run: cargo test --all-features - working-directory: ./rust/guts - - name: guts no_std build - run: cargo build --no-default-features - working-directory: ./rust/guts - - name: guts no_std test # note that rust/guts/src/test.rs still uses libstd - run: cargo test --no-default-features - working-directory: ./rust/guts - b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} diff --git a/third-party/blake3/Cargo.toml b/third-party/blake3/Cargo.toml index 55eb8a41df..8b1ff7f37e 100644 --- a/third-party/blake3/Cargo.toml +++ b/third-party/blake3/Cargo.toml @@ -1,10 +1,10 @@ [package] name = "blake3" -version = "1.5.1" +version = "1.5.5" authors = ["Jack O'Connor ", "Samuel Neves"] description = "the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" -license = "CC0-1.0 OR Apache-2.0" +license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" documentation = "https://docs.rs/blake3" 
readme = "README.md" edition = "2021" @@ -29,7 +29,10 @@ std = [] # `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` # methods, for multithreaded hashing. However, even if this feature is enabled, # all other APIs remain single-threaded. -rayon = ["dep:rayon", "std"] +# +# Implementation detail: We take a dependency on rayon-core instead of rayon, +# because it builds faster and still includes all the APIs we need. +rayon = ["dep:rayon-core", "std"] # The `mmap` feature (disabled by default, but enabled for docs.rs) adds the # `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` @@ -46,12 +49,21 @@ zeroize = ["dep:zeroize", "arrayvec/zeroize"] # who use it should expect breaking changes between patch versions of this # crate. (The "*-preview" feature name follows the conventions of the RustCrypto # "signature" crate.) -traits-preview = ["digest"] +traits-preview = ["dep:digest"] # ---------- Features below this line are undocumented and unstable. ---------- # The following features are mainly intended for testing and benchmarking, and # they might change or disappear at any time without a major version bump. +# It wasn't originally intended to expose "digest" as its own feature, but the +# traits-preview feature above predated the "dep:" syntax in Cargo. Version +# 1.5.2 of this crate started using "dep:" syntax, but that broke some callers +# in the wild (https://solana.stackexchange.com/q/17787/29050). This feature +# unbreaks those callers. When Cargo gains the ability to deprecate features, +# this feature will be deprecated. Note that the relevant trait implementations +# are still gated by "traits-preview". +digest = ["dep:digest"] + # By default on x86_64, this crate uses Samuel Neves' hand-written assembly # implementations for SSE4.1, AVX2, and AVX512. (These provide both the best # runtime performance, and the fastest build times.) 
And by default on 32-bit @@ -94,13 +106,13 @@ features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] arrayref = "0.3.5" arrayvec = { version = "0.7.4", default-features = false } -constant_time_eq = "0.3.0" +constant_time_eq = { version = "0.3.1", default-features = false } cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } memmap2 = { version = "0.9", optional = true } -rayon = { version = "1.2.1", optional = true } +rayon-core = { version = "1.12.1", optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } -zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true } +zeroize = { version = "1", default-features = false, optional = true } [dev-dependencies] hmac = "0.12.0" @@ -111,6 +123,7 @@ rand_chacha = "0.3.0" reference_impl = { path = "./reference_impl" } tempfile = "3.8.0" serde_json = "1.0.107" +ciborium = "0.2.2" [build-dependencies] -cc = "1.0.4" +cc = "1.1.12" diff --git a/LICENSE.third-party b/third-party/blake3/LICENSE_A2 similarity index 63% rename from LICENSE.third-party rename to third-party/blake3/LICENSE_A2 index f4e5606ef2..d512ca94d0 100644 --- a/LICENSE.third-party +++ b/third-party/blake3/LICENSE_A2 @@ -1,61 +1,3 @@ -Mold includes a number of subcomponents with separate copyright notices -and license terms. Your use of the source code for the these subcomponents -is subject to the terms and conditions of the following licenses. 
- - -=== For the third-party/mimalloc component: - -MIT License - -Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - - -=== For the third-party/rust-demangle component: - -Permission is hereby granted, free of charge, to any -person obtaining a copy of this software and associated -documentation files (the "Software"), to deal in the -Software without restriction, including without -limitation the rights to use, copy, modify, merge, -publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software -is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice -shall be included in all copies or substantial portions -of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT -SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. - - -=== For the third-party/tbb component: Apache License Version 2.0, January 2004 @@ -245,7 +187,7 @@ DEALINGS IN THE SOFTWARE. same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2019 Jack O'Connor and Samuel Neves Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -258,91 +200,3 @@ DEALINGS IN THE SOFTWARE. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - - -=== For the third-party/xxhash component: - -xxHash Library -Copyright (c) 2012-2020 Yann Collet -All rights reserved. - -BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, this - list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -=== For the third-party/zlib component: - - (C) 1995-2022 Jean-loup Gailly and Mark Adler - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. - - Jean-loup Gailly Mark Adler - jloup@gzip.org madler@alumni.caltech.edu - - -=== For the third-party/zstd component: - -BSD License - -For Zstandard software - -Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 
- -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name Facebook nor the names of its contributors may be used to - endorse or promote products derived from this software without specific - prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third-party/blake3/LICENSE b/third-party/blake3/LICENSE_A2LLVM similarity index 53% rename from third-party/blake3/LICENSE rename to third-party/blake3/LICENSE_A2LLVM index f5892efc3b..8d11bec0e8 100644 --- a/third-party/blake3/LICENSE +++ b/third-party/blake3/LICENSE_A2LLVM @@ -1,139 +1,10 @@ -This work is released into the public domain with CC0 1.0. Alternatively, it is -licensed under the Apache License 2.0. 
- -------------------------------------------------------------------------------- - -Creative Commons Legal Code - -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. 
- -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. 
To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. 
In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. 
Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. - -------------------------------------------------------------------------------- - Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - 1. Definitions. + 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. @@ -192,14 +63,14 @@ express Statement of Purpose. on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. - 2. Grant of Copyright License. Subject to the terms and conditions of + 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. - 3. Grant of Patent License. Subject to the terms and conditions of + 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, @@ -215,7 +86,7 @@ express Statement of Purpose. 
granted to You under this License for that Work shall terminate as of the date such litigation is filed. - 4. Redistribution. You may reproduce and distribute copies of the + 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: @@ -256,7 +127,7 @@ express Statement of Purpose. reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. - 5. Submission of Contributions. Unless You explicitly state otherwise, + 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. @@ -264,12 +135,12 @@ express Statement of Purpose. the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. - 6. Trademarks. This License does not grant permission to use the trade + 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. - 7. Disclaimer of Warranty. Unless required by applicable law or + 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or @@ -279,7 +150,7 @@ express Statement of Purpose. appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. - 8. Limitation of Liability. In no event and under no legal theory, + 8. 
Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be @@ -291,7 +162,7 @@ express Statement of Purpose. other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. - 9. Accepting Warranty or Additional Liability. While redistributing + 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this @@ -302,9 +173,9 @@ express Statement of Purpose. incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - END OF TERMS AND CONDITIONS + END OF TERMS AND CONDITIONS - APPENDIX: How to apply the Apache License to your work. + APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" @@ -315,16 +186,34 @@ express Statement of Purpose. same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2019 Jack O'Connor and Samuel Neves + Copyright 2019 Jack O'Connor and Samuel Neves - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +---- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + diff --git a/third-party/blake3/LICENSE_CC0 b/third-party/blake3/LICENSE_CC0 new file mode 100644 index 0000000000..0e259d42c9 --- /dev/null +++ b/third-party/blake3/LICENSE_CC0 @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. 
DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. 
A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. 
No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. diff --git a/third-party/blake3/README.md b/third-party/blake3/README.md index 6b493775b3..c1ce9613f6 100644 --- a/third-party/blake3/README.md +++ b/third-party/blake3/README.md @@ -1,4 +1,4 @@ -# BLAKE3 +# BLAKE3 BLAKE3 is a cryptographic hash function that is: @@ -68,13 +68,16 @@ This repository is the official implementation of BLAKE3. It includes: BLAKE3 was designed by: -* [@oconnor663 ](https://github.com/oconnor663) (Jack O'Connor) -* [@sneves](https://github.com/sneves) (Samuel Neves) -* [@veorq](https://github.com/veorq) (Jean-Philippe Aumasson) -* [@zookozcash](https://github.com/zookozcash) (Zooko) +* [@oconnor663] (Jack O'Connor) +* [@sneves] (Samuel Neves) +* [@veorq] (Jean-Philippe Aumasson) +* [@zookozcash] (Zooko) The development of BLAKE3 was sponsored by [Electric Coin Company](https://electriccoin.co). 
+BLAKE3 is also [specified](https://c2sp.org/BLAKE3) in the [Community +Cryptography Specification Project (C2SP)](https://c2sp.org). + *NOTE: BLAKE3 is not a password hashing algorithm, because it's designed to be fast, whereas password hashing should not be fast. If you hash passwords to store the hashes or if you derive keys from passwords, @@ -187,14 +190,14 @@ bindings](https://github.com/sken77/BLAKE3jni). Please see [CONTRIBUTING.md](CONTRIBUTING.md). -## Intellectual property +## Licenses + +This work is released into the public domain with [CC0 1.0](./LICENSE_CC0). +Alternatively, it is licensed under any of the following: -The Rust code is copyright Jack O'Connor, 2019-2020. The C code is -copyright Samuel Neves and Jack O'Connor, 2019-2020. The assembly code -is copyright Samuel Neves, 2019-2020. +* [Apache 2.0](./LICENSE_A2) +* [Apache 2.0 with LLVM exceptions](./LICENSE_A2LLVM) -This work is released into the public domain with CC0 1.0. -Alternatively, it is licensed under the Apache License 2.0. ## Adoption & deployment @@ -218,6 +221,10 @@ Here's a (non-exhaustive) list of protocols and software that use BLAKE3: ## Miscellany -- [@veorq](https://github.com/veorq) and - [@oconnor663](https://github.com/oconnor663) did [a podcast - interview](https://www.cryptography.fm/3) about designing BLAKE3. +- [@veorq] and [@oconnor663] did [an interview with Cryptography FM](https://www.cryptography.fm/3). +- [@oconnor663] did [an interview with Saito](https://www.youtube.com/watch?v=cJkmIt7yN_E). 
+ +[@oconnor663]: https://github.com/oconnor663 +[@sneves]: https://github.com/sneves +[@veorq]: https://github.com/veorq +[@zookozcash]: https://github.com/zookozcash diff --git a/third-party/blake3/b3sum/Cargo.lock b/third-party/blake3/b3sum/Cargo.lock index 2300d3bf6a..2904481d06 100644 --- a/third-party/blake3/b3sum/Cargo.lock +++ b/third-party/blake3/b3sum/Cargo.lock @@ -4,93 +4,94 @@ version = 3 [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.2" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.81" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "arrayref" -version = "0.3.7" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "b3sum" -version = "1.5.1" +version = "1.5.5" dependencies = [ "anyhow", "blake3", "clap", "duct", "hex", - "rayon", + "rayon-core", "tempfile", "wild", ] [[package]] name = "bitflags" -version = "2.4.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "blake3" -version = "1.5.1" +version = "1.5.5" dependencies = [ "arrayref", "arrayvec", @@ -98,14 +99,17 @@ dependencies = [ "cfg-if", "constant_time_eq", "memmap2", - "rayon", + "rayon-core", ] [[package]] name = "cc" -version = "1.0.90" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +checksum = 
"fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +dependencies = [ + "shlex", +] [[package]] name = "cfg-if" @@ -115,9 +119,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.5.2" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" dependencies = [ "clap_builder", "clap_derive", @@ -125,9 +129,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.2" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" dependencies = [ "anstream", "anstyle", @@ -138,9 +142,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.0" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ "heck", "proc-macro2", @@ -150,21 +154,21 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.0" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "constant_time_eq" -version = "0.3.0" +version = 
"0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "crossbeam-deque" @@ -187,9 +191,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.19" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "duct" @@ -203,17 +207,11 @@ dependencies = [ "shared_child", ] -[[package]] -name = "either" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" - [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -221,9 +219,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "glob" @@ -233,9 +231,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "heck" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hex" @@ -243,71 
+241,67 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "libc" -version = "0.2.153" +version = "0.2.166" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "c2ccc108bbc0b1331bd061864e7cd823c0cab660bbe6970e66e2c0614decde36" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "memmap2" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" dependencies = [ "libc", ] [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "os_pipe" -version = "1.1.5" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57119c3b893986491ec9aa85056780d3a0f3cf4da7cc09dd3650dbd6c6738fb9" +checksum = "5ffd2b0a5634335b135d5728d84c5e0fd726954b87111f7506a61c502280d982" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "proc-macro2" -version = 
"1.0.79" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] -[[package]] -name = "rayon" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" -dependencies = [ - "either", - "rayon-core", -] - [[package]] name = "rayon-core" version = "1.12.1" @@ -320,9 +314,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags", "errno", @@ -333,25 +327,31 @@ dependencies = [ [[package]] name = "shared_child" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0d94659ad3c2137fef23ae75b03d5241d633f8acded53d672decfa0e6e0caef" +checksum = "09fa9338aed9a1df411814a5b2252f7cd206c55ae9bf2fa763f8de84603aa60c" dependencies = [ "libc", - "winapi", + "windows-sys 0.59.0", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "strsim" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" dependencies = [ "proc-macro2", "quote", @@ -360,37 +360,38 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.10.1" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "terminal_size" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +checksum = "4f599bd7ca042cfdf8f4512b277c02ba102247820f9d9d4a9f521f496751a6ef" dependencies = [ "rustix", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "utf8parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "wild" @@ -401,156 +402,84 @@ dependencies = [ "glob", ] -[[package]] -name = 
"winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", -] - [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets", ] [[package]] -name = "windows-targets" -version = "0.48.5" +name = "windows-sys" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] -name = "windows_i686_msvc" -version = "0.48.5" +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" -version = "0.48.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +checksum = 
"24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/third-party/blake3/b3sum/Cargo.toml b/third-party/blake3/b3sum/Cargo.toml index 812ed224b2..81eb25e480 100644 --- a/third-party/blake3/b3sum/Cargo.toml +++ b/third-party/blake3/b3sum/Cargo.toml @@ -1,10 +1,10 @@ [package] name = "b3sum" -version = "1.5.1" +version = "1.5.5" authors = ["Jack O'Connor "] description = "a command line implementation of the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" -license = "CC0-1.0 OR Apache-2.0" +license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" readme = "README.md" edition = "2021" @@ -18,7 +18,7 @@ anyhow = "1.0.25" blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" -rayon = "1.2.1" +rayon-core = "1.12.1" wild = "2.0.3" [dev-dependencies] diff --git a/third-party/blake3/b3sum/LICENSE_A2 b/third-party/blake3/b3sum/LICENSE_A2 new file mode 120000 index 0000000000..c7b0be82d0 --- /dev/null +++ b/third-party/blake3/b3sum/LICENSE_A2 @@ -0,0 +1 @@ +../LICENSE_A2 \ No newline at end of file diff --git a/third-party/blake3/b3sum/LICENSE_A2LLVM b/third-party/blake3/b3sum/LICENSE_A2LLVM new file mode 120000 index 0000000000..c9f3d16a6d --- /dev/null +++ b/third-party/blake3/b3sum/LICENSE_A2LLVM @@ -0,0 +1 @@ +../LICENSE_A2LLVM \ No newline at end of file diff --git a/third-party/blake3/b3sum/LICENSE_CC0 b/third-party/blake3/b3sum/LICENSE_CC0 new file mode 120000 index 0000000000..856562a793 --- /dev/null +++ b/third-party/blake3/b3sum/LICENSE_CC0 @@ -0,0 +1 @@ +../LICENSE_CC0 \ No newline at end of file diff --git 
a/third-party/blake3/b3sum/src/main.rs b/third-party/blake3/b3sum/src/main.rs index 228737ff02..baa6b6ca27 100644 --- a/third-party/blake3/b3sum/src/main.rs +++ b/third-party/blake3/b3sum/src/main.rs @@ -244,8 +244,11 @@ fn filepath_to_string(filepath: &Path) -> FilepathString { filepath_string = filepath_string.replace('\\', "/"); } let mut is_escaped = false; - if filepath_string.contains('\\') || filepath_string.contains('\n') { - filepath_string = filepath_string.replace('\\', "\\\\").replace('\n', "\\n"); + if filepath_string.contains(['\\', '\n', '\r']) { + filepath_string = filepath_string + .replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('\r', "\\r"); is_escaped = true; } FilepathString { @@ -303,6 +306,7 @@ fn unescape(mut path: &str) -> Result { match path[i + 1..].chars().next().unwrap() { // Anything other than a recognized escape sequence is an error. 'n' => unescaped.push_str("\n"), + 'r' => unescaped.push_str("\r"), '\\' => unescaped.push_str("\\"), _ => bail!("Invalid backslash escape"), } @@ -321,13 +325,11 @@ struct ParsedCheckLine { } fn parse_check_line(mut line: &str) -> Result { - // Trim off the trailing newline, if any. - line = line.trim_end_matches('\n'); + // Trim off the trailing newlines, if any. + line = line.trim_end_matches(['\r', '\n']); // If there's a backslash at the front of the line, that means we need to // unescape the path below. This matches the behavior of e.g. md5sum. 
- let first = if let Some(c) = line.chars().next() { - c - } else { + let Some(first) = line.chars().next() else { bail!("Empty line"); }; let mut is_escaped = false; @@ -475,7 +477,7 @@ fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Resu fn main() -> Result<()> { let args = Args::parse()?; - let mut thread_pool_builder = rayon::ThreadPoolBuilder::new(); + let mut thread_pool_builder = rayon_core::ThreadPoolBuilder::new(); if let Some(num_threads) = args.num_threads() { thread_pool_builder = thread_pool_builder.num_threads(num_threads); } diff --git a/third-party/blake3/b3sum/src/unit_tests.rs b/third-party/blake3/b3sum/src/unit_tests.rs index 1fa1a17dc3..b95c4a22fe 100644 --- a/third-party/blake3/b3sum/src/unit_tests.rs +++ b/third-party/blake3/b3sum/src/unit_tests.rs @@ -28,13 +28,13 @@ fn test_parse_check_line() { file_path, expected_hash, } = crate::parse_check_line( - "fafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafa fo \to\n\n\n", + "fafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafa \t\r\n\n\r \t\r\n\n\r", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0xfa; 32])); assert!(!is_escaped); - assert_eq!(file_string, "fo \to"); - assert_eq!(file_path, Path::new("fo \to")); + assert_eq!(file_string, " \t\r\n\n\r \t"); + assert_eq!(file_path, Path::new(" \t\r\n\n\r \t")); // path is one space let crate::ParsedCheckLine { @@ -71,20 +71,20 @@ fn test_parse_check_line() { assert_eq!(file_path, Path::new("fo\\a\\no")); } - // escaped newline + // escaped newlines let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( - "\\4444444444444444444444444444444444444444444444444444444444444444 fo\\n\\no", + "\\4444444444444444444444444444444444444444444444444444444444444444 fo\\r\\n\\n\\ro", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x44; 32])); assert!(is_escaped); - assert_eq!(file_string, "fo\\n\\no"); - assert_eq!(file_path, 
Path::new("fo\n\no")); + assert_eq!(file_string, "fo\\r\\n\\n\\ro"); + assert_eq!(file_path, Path::new("fo\r\n\n\ro")); // Escaped newline and backslash. Again because backslash is not allowed on // Windows, this test is Unix-only. @@ -187,3 +187,19 @@ fn test_parse_check_line() { .unwrap_err(); } } + +#[test] +fn test_filepath_to_string() { + let output = crate::filepath_to_string(Path::new("foo")); + assert_eq!(output.filepath_string, "foo"); + assert!(!output.is_escaped); + + let output = crate::filepath_to_string(Path::new("f\\ \t\r\noo")); + if cfg!(windows) { + // We normalize backslashes to forward slashes on Windows. + assert_eq!(output.filepath_string, "f/ \t\\r\\noo"); + } else { + assert_eq!(output.filepath_string, "f\\\\ \t\\r\\noo"); + } + assert!(output.is_escaped); +} diff --git a/third-party/blake3/b3sum/tests/cli_tests.rs b/third-party/blake3/b3sum/tests/cli_tests.rs index d5d4efa3ed..f501ca6fd6 100644 --- a/third-party/blake3/b3sum/tests/cli_tests.rs +++ b/third-party/blake3/b3sum/tests/cli_tests.rs @@ -235,8 +235,8 @@ fn test_newline_and_backslash_escaping_on_unix() { {0} abcdef \\{0} abc\\ndef \\{0} abc\\\\def -{0} abc\rdef -\\{0} abc\r\\ndef +\\{0} abc\\rdef +\\{0} abc\\r\\ndef {0} subdir/foo", empty_hash, ); diff --git a/third-party/blake3/benches/bench.rs b/third-party/blake3/benches/bench.rs index 5efb9e6f5d..3c32e51882 100644 --- a/third-party/blake3/benches/bench.rs +++ b/third-party/blake3/benches/bench.rs @@ -515,3 +515,101 @@ fn bench_two_updates(b: &mut Bencher) { hasher.finalize() }); } + +fn bench_xof(b: &mut Bencher, len: usize) { + b.bytes = len as u64; + let mut output = [0u8; 64 * BLOCK_LEN]; + let output_slice = &mut output[..len]; + let mut xof = blake3::Hasher::new().finalize_xof(); + b.iter(|| xof.fill(output_slice)); +} + +#[bench] +fn bench_xof_01_block(b: &mut Bencher) { + bench_xof(b, 1 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_02_blocks(b: &mut Bencher) { + bench_xof(b, 2 * BLOCK_LEN); +} + +#[bench] +fn 
bench_xof_03_blocks(b: &mut Bencher) { + bench_xof(b, 3 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_04_blocks(b: &mut Bencher) { + bench_xof(b, 4 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_05_blocks(b: &mut Bencher) { + bench_xof(b, 5 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_06_blocks(b: &mut Bencher) { + bench_xof(b, 6 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_07_blocks(b: &mut Bencher) { + bench_xof(b, 7 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_08_blocks(b: &mut Bencher) { + bench_xof(b, 8 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_09_blocks(b: &mut Bencher) { + bench_xof(b, 9 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_10_blocks(b: &mut Bencher) { + bench_xof(b, 10 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_11_blocks(b: &mut Bencher) { + bench_xof(b, 11 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_12_blocks(b: &mut Bencher) { + bench_xof(b, 12 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_13_blocks(b: &mut Bencher) { + bench_xof(b, 13 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_14_blocks(b: &mut Bencher) { + bench_xof(b, 14 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_15_blocks(b: &mut Bencher) { + bench_xof(b, 15 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_16_blocks(b: &mut Bencher) { + bench_xof(b, 16 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_32_blocks(b: &mut Bencher) { + bench_xof(b, 32 * BLOCK_LEN); +} + +#[bench] +fn bench_xof_64_blocks(b: &mut Bencher) { + bench_xof(b, 64 * BLOCK_LEN); +} diff --git a/third-party/blake3/build.rs b/third-party/blake3/build.rs index a5dfd0625d..57f72b7643 100644 --- a/third-party/blake3/build.rs +++ b/third-party/blake3/build.rs @@ -74,7 +74,7 @@ fn is_big_endian() -> bool { endianness() == "big" } -// Windows targets may be using the MSVC toolchain or the GNU toolchain. The +// Windows targets may be using the MSVC toolchain or the MinGW toolchain. The // right compiler flags to use depend on the toolchain. 
(And we don't want to // use flag_if_supported, because we don't want features to be silently // disabled by old compilers.) @@ -85,11 +85,15 @@ fn is_windows_msvc() -> bool { && target_components()[3] == "msvc" } +// MinGW toolchain uses 2 different targets depending on the main compiler. +// Target for a general MinGW toolchain ends with `-gnu` (GCC is used as C +// compiler). Target for a LLVM-MinGW toolchain (Clang is used as C compiler) +// ends with `-gnullvm`. fn is_windows_gnu() -> bool { // Some targets are only two components long, so check in steps. target_components()[1] == "pc" && target_components()[2] == "windows" - && target_components()[3] == "gnu" + && target_components()[3] != "msvc" } fn new_build() -> cc::Build { @@ -97,6 +101,11 @@ fn new_build() -> cc::Build { if !is_windows_msvc() { build.flag("-std=c11"); } + // Do NOT trigger a rebuild any time the env changes (e.g. $PATH). + // This prevents all downstream crates from being rebuilt when `cargo check` + // or `cargo build` are run in different environments, like Rust Analyzer + // vs. in the terminal vs. in a Git pre-commit hook. + build.emit_rerun_if_env_changed(false); build } @@ -240,6 +249,23 @@ fn build_neon_c_intrinsics() { } fn main() -> Result<(), Box> { + // As of Rust 1.80, unrecognized config names are warnings. Give Cargo all of our config names. + let all_cfgs = [ + "blake3_sse2_ffi", + "blake3_sse2_rust", + "blake3_sse41_ffi", + "blake3_sse41_rust", + "blake3_avx2_ffi", + "blake3_avx2_rust", + "blake3_avx512_ffi", + "blake3_neon", + ]; + for cfg_name in all_cfgs { + // TODO: Switch this whole file to the new :: syntax when our MSRV reaches 1.77. 
+ // https://doc.rust-lang.org/cargo/reference/build-scripts.html#outputs-of-the-build-script + println!("cargo:rustc-check-cfg=cfg({cfg_name}, values(none()))"); + } + if is_pure() && is_neon() { panic!("It doesn't make sense to enable both \"pure\" and \"neon\"."); } diff --git a/third-party/blake3/c/.gitignore b/third-party/blake3/c/.gitignore index 3d4b7041a9..ff52a8037c 100644 --- a/third-party/blake3/c/.gitignore +++ b/third-party/blake3/c/.gitignore @@ -2,3 +2,5 @@ blake3 example build/ *.o + +CMakeUserPresets.json diff --git a/third-party/blake3/c/CMakeLists.txt b/third-party/blake3/c/CMakeLists.txt index 3a3b232dcb..ebcca1dbb0 100644 --- a/third-party/blake3/c/CMakeLists.txt +++ b/third-party/blake3/c/CMakeLists.txt @@ -4,9 +4,13 @@ cmake_minimum_required(VERSION 3.9 FATAL_ERROR) if (POLICY CMP0128) cmake_policy(SET CMP0128 NEW) endif() +# mark_as_advanced does not implicitly create UNINITIALIZED cache entries +if (POLICY CMP0102) + cmake_policy(SET CMP0102 NEW) +endif() project(libblake3 - VERSION 1.5.1 + VERSION 1.5.5 DESCRIPTION "BLAKE3 C implementation" LANGUAGES C ASM ) @@ -26,6 +30,13 @@ if(MSVC) set(BLAKE3_CFLAGS_AVX2 "/arch:AVX2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "/arch:AVX512" CACHE STRING "the compiler flags to enable AVX512") + set(BLAKE3_AMD64_ASM_SOURCES + blake3_avx2_x86-64_windows_msvc.asm + blake3_avx512_x86-64_windows_msvc.asm + blake3_sse2_x86-64_windows_msvc.asm + blake3_sse41_x86-64_windows_msvc.asm + ) + elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") @@ -34,6 +45,23 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") + if (WIN32) + set(BLAKE3_AMD64_ASM_SOURCES + blake3_avx2_x86-64_windows_gnu.S + blake3_avx512_x86-64_windows_gnu.S + 
blake3_sse2_x86-64_windows_gnu.S + blake3_sse41_x86-64_windows_gnu.S + ) + + elseif(UNIX) + set(BLAKE3_AMD64_ASM_SOURCES + blake3_avx2_x86-64_unix.S + blake3_avx512_x86-64_unix.S + blake3_sse2_x86-64_unix.S + blake3_sse41_x86-64_unix.S + ) + endif() + if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) # 32-bit ARMv8 needs NEON to be enabled explicitly @@ -41,6 +69,47 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" endif() endif() +mark_as_advanced(BLAKE3_CFLAGS_SSE2 BLAKE3_CFLAGS_SSE4.1 BLAKE3_CFLAGS_AVX2 BLAKE3_CFLAGS_AVX512 BLAKE3_CFLAGS_NEON) +mark_as_advanced(BLAKE3_AMD64_ASM_SOURCES) + +message(STATUS "BLAKE3 SIMD configuration: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}") +if(MSVC AND DEFINED CMAKE_C_COMPILER_ARCHITECTURE_ID) + if(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]86") + set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use") + + elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]64") + set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use") + + elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Aa][Rr][Mm]64") + set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use") + + else() + set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use") + endif() + +elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES) + set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use") + +elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES + AND DEFINED BLAKE3_CFLAGS_SSE2 + AND DEFINED BLAKE3_CFLAGS_SSE4.1 + AND DEFINED BLAKE3_CFLAGS_AVX2 + AND DEFINED BLAKE3_CFLAGS_AVX512) + set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use") + +elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + OR ANDROID_ABI STREQUAL "armeabi-v7a" + OR BLAKE3_USE_NEON_INTRINSICS) + AND (DEFINED BLAKE3_CFLAGS_NEON + OR CMAKE_SIZEOF_VOID_P EQUAL 8)) + set(BLAKE3_SIMD_TYPE 
"neon-intrinsics" CACHE STRING "the SIMD acceleration type to use") + +else() + set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use") +endif() + +mark_as_advanced(BLAKE3_SIMD_TYPE) + # library target add_library(blake3 blake3.c @@ -76,63 +145,25 @@ if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) endif() # optional SIMD sources -macro(BLAKE3_DISABLE_SIMD) - set(BLAKE3_SIMD_AMD64_ASM OFF) - set(BLAKE3_SIMD_X86_INTRINSICS OFF) - set(BLAKE3_SIMD_NEON_INTRINSICS OFF) - target_compile_definitions(blake3 PRIVATE - BLAKE3_USE_NEON=0 - BLAKE3_NO_SSE2 - BLAKE3_NO_SSE41 - BLAKE3_NO_AVX2 - BLAKE3_NO_AVX512 - ) -endmacro() - -if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) +if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm") + if (NOT DEFINED BLAKE3_AMD64_ASM_SOURCES) + message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'amd64-asm' but no assembly sources are available for the target architecture.") + endif() set(BLAKE3_SIMD_AMD64_ASM ON) if(MSVC) enable_language(ASM_MASM) - target_sources(blake3 PRIVATE - blake3_avx2_x86-64_windows_msvc.asm - blake3_avx512_x86-64_windows_msvc.asm - blake3_sse2_x86-64_windows_msvc.asm - blake3_sse41_x86-64_windows_msvc.asm - ) + endif() - elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" - OR CMAKE_C_COMPILER_ID STREQUAL "Clang" - OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") - if (WIN32) - target_sources(blake3 PRIVATE - blake3_avx2_x86-64_windows_gnu.S - blake3_avx512_x86-64_windows_gnu.S - blake3_sse2_x86-64_windows_gnu.S - blake3_sse41_x86-64_windows_gnu.S - ) - - elseif(UNIX) - target_sources(blake3 PRIVATE - blake3_avx2_x86-64_unix.S - blake3_avx512_x86-64_unix.S - blake3_sse2_x86-64_unix.S - blake3_sse41_x86-64_unix.S - ) - - else() - BLAKE3_DISABLE_SIMD() - endif() + target_sources(blake3 PRIVATE ${BLAKE3_AMD64_ASM_SOURCES}) - else() - BLAKE3_DISABLE_SIMD() +elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics") + if (NOT DEFINED BLAKE3_CFLAGS_SSE2 + OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1 + OR NOT 
DEFINED BLAKE3_CFLAGS_AVX2 + OR NOT DEFINED BLAKE3_CFLAGS_AVX512) + message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.") endif() - -elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRINSICS) - AND DEFINED BLAKE3_CFLAGS_SSE2 - AND DEFINED BLAKE3_CFLAGS_SSE4.1 - AND DEFINED BLAKE3_CFLAGS_AVX2 - AND DEFINED BLAKE3_CFLAGS_AVX512) set(BLAKE3_SIMD_X86_INTRINSICS ON) target_sources(blake3 PRIVATE @@ -146,11 +177,7 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") -elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES - OR ANDROID_ABI STREQUAL "armeabi-v7a" - OR BLAKE3_USE_NEON_INTRINSICS) - AND (DEFINED BLAKE3_CFLAGS_NEON - OR CMAKE_SIZEOF_VOID_P EQUAL 8)) +elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics") set(BLAKE3_SIMD_NEON_INTRINSICS ON) target_sources(blake3 PRIVATE @@ -164,8 +191,17 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") endif() +elseif(BLAKE3_SIMD_TYPE STREQUAL "none") + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=0 + BLAKE3_NO_SSE2 + BLAKE3_NO_SSE41 + BLAKE3_NO_AVX2 + BLAKE3_NO_AVX512 + ) + else() - BLAKE3_DISABLE_SIMD() + message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to an unknown value: '${BLAKE3_SIMD_TYPE}'") endif() # cmake install support @@ -193,11 +229,37 @@ install(FILES DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3" ) +# Function for joining paths known from most languages +# +# SPDX-License-Identifier: (MIT OR CC0-1.0) +# Copyright 2020 Jan Tojnar +# https://github.com/jtojnar/cmake-snips +# +# Modelled after Python’s os.path.join +# https://docs.python.org/3.7/library/os.path.html#os.path.join +# 
Windows not supported +function(join_paths joined_path first_path_segment) + set(temp_path "${first_path_segment}") + foreach(current_segment IN LISTS ARGN) + if(NOT ("${current_segment}" STREQUAL "")) + if(IS_ABSOLUTE "${current_segment}") + set(temp_path "${current_segment}") + else() + set(temp_path "${temp_path}/${current_segment}") + endif() + endif() + endforeach() + set(${joined_path} "${temp_path}" PARENT_SCOPE) +endfunction() + +join_paths(PKG_CONFIG_INSTALL_LIBDIR "\${prefix}" "${CMAKE_INSTALL_LIBDIR}") +join_paths(PKG_CONFIG_INSTALL_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") configure_file(libblake3.pc.in libblake3.pc @ONLY) install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") # print feature summary +# add_feature_info cannot directly use the BLAKE3_SIMD_TYPE :( add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.") add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.") add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.") diff --git a/third-party/blake3/c/CMakePresets.json b/third-party/blake3/c/CMakePresets.json new file mode 100644 index 0000000000..ffb35db0bd --- /dev/null +++ b/third-party/blake3/c/CMakePresets.json @@ -0,0 +1,73 @@ +{ + "version": 3, + "cmakeMinimumRequired": { + "major": 3, + "minor": 22, + "patch": 0 + }, + "configurePresets": [ + { + "name": "base", + "hidden": true, + "binaryDir": "${sourceDir}/build/${presetName}" + }, + { + "name": "msvc", + "hidden": true, + "generator": "Visual Studio 17 2022", + "vendor": { + "microsoft.com/VisualStudioSettings/CMake/1.0": { + "hostOS": [ + "Windows" + ] + } + } + }, + { + "name": "x64-windows-msvc", + "inherits": [ + "msvc", + "base" + ], + "architecture": "x64" + }, + { + "name": "x86-windows-msvc", + "inherits": [ + "msvc", + "base" + ], + "architecture": "Win32" + }, + 
{ + "name": "arm64-windows-msvc", + "inherits": [ + "msvc", + "base" + ], + "architecture": "ARM64" + } + ], + "buildPresets": [ + { + "name": "x64-windows-msvc-debug", + "configurePreset": "x64-windows-msvc", + "configuration": "Debug" + }, + { + "name": "x64-windows-msvc-release", + "configurePreset": "x64-windows-msvc", + "configuration": "RelWithDebInfo" + }, + { + "name": "x86-windows-msvc-debug", + "configurePreset": "x86-windows-msvc", + "configuration": "Debug" + }, + { + "name": "x86-windows-msvc-release", + "configurePreset": "x86-windows-msvc", + "configuration": "RelWithDebInfo" + } + ] +} \ No newline at end of file diff --git a/third-party/blake3/c/blake3.c b/third-party/blake3/c/blake3.c index 1b44c71934..7e6d01ec5c 100644 --- a/third-party/blake3/c/blake3.c +++ b/third-party/blake3/c/blake3.c @@ -88,24 +88,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, size_t out_len) { + if (out_len == 0) { + return; + } uint64_t output_block_counter = seek / 64; size_t offset_within_block = seek % 64; uint8_t wide_buf[64]; - while (out_len > 0) { - blake3_compress_xof(self->input_cv, self->block, self->block_len, - output_block_counter, self->flags | ROOT, wide_buf); - size_t available_bytes = 64 - offset_within_block; - size_t memcpy_len; - if (out_len > available_bytes) { - memcpy_len = available_bytes; - } else { - memcpy_len = out_len; - } - memcpy(out, wide_buf + offset_within_block, memcpy_len); - out += memcpy_len; - out_len -= memcpy_len; + if(offset_within_block) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + const size_t available_bytes = 64 - offset_within_block; + const size_t bytes = out_len > available_bytes ? 
available_bytes : out_len; + memcpy(out, wide_buf + offset_within_block, bytes); + out += bytes; + out_len -= bytes; output_block_counter += 1; - offset_within_block = 0; + } + if(out_len / 64) { + blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64); + } + output_block_counter += out_len / 64; + out += out_len & -64; + out_len -= out_len & -64; + if(out_len) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + memcpy(out, wide_buf, out_len); } } @@ -134,9 +140,7 @@ INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, input_len -= BLAKE3_BLOCK_LEN; } - size_t take = chunk_state_fill_buf(self, input, input_len); - input += take; - input_len -= take; + chunk_state_fill_buf(self, input, input_len); } INLINE output_t chunk_state_output(const blake3_chunk_state *self) { @@ -430,7 +434,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { // of the whole tree, and it would need to be ROOT finalized. We can't // compress it until we know. // 2) This 64 KiB input might complete a larger tree, whose root node is -// similarly going to be the the root of the whole tree. For example, maybe +// similarly going to be the root of the whole tree. For example, maybe // we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the // node at the root of the 256 KiB subtree until we know how to finalize it. 
// diff --git a/third-party/blake3/c/blake3.h b/third-party/blake3/c/blake3.h index 48284e5089..9ed62f97e2 100644 --- a/third-party/blake3/c/blake3.h +++ b/third-party/blake3/c/blake3.h @@ -30,7 +30,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.5.1" +#define BLAKE3_VERSION_STRING "1.5.5" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/third-party/blake3/c/blake3_avx512.c b/third-party/blake3/c/blake3_avx512.c index d6b1ae9b18..f88a32d312 100644 --- a/third-party/blake3/c/blake3_avx512.c +++ b/third-party/blake3/c/blake3_avx512.c @@ -7,23 +7,27 @@ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) INLINE __m128i loadu_128(const uint8_t src[16]) { - return _mm_loadu_si128((const __m128i *)src); + return _mm_loadu_si128((void*)src); } INLINE __m256i loadu_256(const uint8_t src[32]) { - return _mm256_loadu_si256((const __m256i *)src); + return _mm256_loadu_si256((void*)src); } INLINE __m512i loadu_512(const uint8_t src[64]) { - return _mm512_loadu_si512((const __m512i *)src); + return _mm512_loadu_si512((void*)src); } INLINE void storeu_128(__m128i src, uint8_t dest[16]) { - _mm_storeu_si128((__m128i *)dest, src); + _mm_storeu_si128((void*)dest, src); } INLINE void storeu_256(__m256i src, uint8_t dest[16]) { - _mm256_storeu_si256((__m256i *)dest, src); + _mm256_storeu_si256((void*)dest, src); +} + +INLINE void storeu_512(__m512i src, uint8_t dest[16]) { + _mm512_storeu_si512((void*)dest, src); } INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } @@ -550,6 +554,54 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); } +static +void blake3_xof4_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[4 * 64]) { + __m128i h_vecs[8] = { + set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]), + set1_128(cv[4]), 
set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]), + }; + uint32_t block_words[16]; + load_block_words(block, block_words); + __m128i msg_vecs[16]; + for (size_t i = 0; i < 16; i++) { + msg_vecs[i] = set1_128(block_words[i]); + } + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, true, &counter_low_vec, &counter_high_vec); + __m128i block_len_vec = set1_128(block_len); + __m128i block_flags_vec = set1_128(flags); + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + for (size_t i = 0; i < 8; i++) { + v[i] = xor_128(v[i], v[i+8]); + v[i+8] = xor_128(v[i+8], h_vecs[i]); + } + transpose_vecs_128(&v[0]); + transpose_vecs_128(&v[4]); + transpose_vecs_128(&v[8]); + transpose_vecs_128(&v[12]); + for (size_t i = 0; i < 4; i++) { + storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]); + storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]); + storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]); + storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]); + } +} + /* * ---------------------------------------------------------------------------- * hash8_avx512 @@ -802,6 +854,50 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); } +static +void blake3_xof8_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[8 * 64]) { + __m256i h_vecs[8] = { + set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]), + set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]), + }; + uint32_t 
block_words[16]; + load_block_words(block, block_words); + __m256i msg_vecs[16]; + for (size_t i = 0; i < 16; i++) { + msg_vecs[i] = set1_256(block_words[i]); + } + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, true, &counter_low_vec, &counter_high_vec); + __m256i block_len_vec = set1_256(block_len); + __m256i block_flags_vec = set1_256(flags); + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + for (size_t i = 0; i < 8; i++) { + v[i] = xor_256(v[i], v[i+8]); + v[i+8] = xor_256(v[i+8], h_vecs[i]); + } + transpose_vecs_256(&v[0]); + transpose_vecs_256(&v[8]); + for (size_t i = 0; i < 8; i++) { + storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]); + storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]); + } +} + /* * ---------------------------------------------------------------------------- * hash16_avx512 @@ -1146,6 +1242,48 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); } +static +void blake3_xof16_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[16 * 64]) { + __m512i h_vecs[8] = { + set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]), + set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]), + }; + uint32_t block_words[16]; + load_block_words(block, block_words); + __m512i msg_vecs[16]; + for (size_t i = 0; i < 16; i++) { + msg_vecs[i] = set1_512(block_words[i]); + } + __m512i counter_low_vec, 
counter_high_vec; + load_counters16(counter, true, &counter_low_vec, &counter_high_vec); + __m512i block_len_vec = set1_512(block_len); + __m512i block_flags_vec = set1_512(flags); + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + for (size_t i = 0; i < 8; i++) { + v[i] = xor_512(v[i], v[i+8]); + v[i+8] = xor_512(v[i+8], h_vecs[i]); + } + transpose_vecs_512(&v[0]); + for (size_t i = 0; i < 16; i++) { + storeu_512(v[i], &out[i * sizeof(__m512i)]); + } +} + /* * ---------------------------------------------------------------------------- * hash_many_avx512 @@ -1218,3 +1356,33 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, out = &out[BLAKE3_OUT_LEN]; } } + +void blake3_xof_many_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t* out, size_t outblocks) { + while (outblocks >= 16) { + blake3_xof16_avx512(cv, block, block_len, counter, flags, out); + counter += 16; + outblocks -= 16; + out += 16 * BLAKE3_BLOCK_LEN; + } + while (outblocks >= 8) { + blake3_xof8_avx512(cv, block, block_len, counter, flags, out); + counter += 8; + outblocks -= 8; + out += 8 * BLAKE3_BLOCK_LEN; + } + while (outblocks >= 4) { + blake3_xof4_avx512(cv, block, block_len, counter, flags, out); + counter += 4; + outblocks -= 4; + out += 4 * BLAKE3_BLOCK_LEN; + } + while (outblocks > 0) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + counter += 1; + outblocks -= 1; + out += BLAKE3_BLOCK_LEN; + } +} diff --git 
a/third-party/blake3/c/blake3_avx512_x86-64_unix.S b/third-party/blake3/c/blake3_avx512_x86-64_unix.S index a06aede0f1..9642e413a3 100644 --- a/third-party/blake3/c/blake3_avx512_x86-64_unix.S +++ b/third-party/blake3/c/blake3_avx512_x86-64_unix.S @@ -19,6 +19,8 @@ .global _blake3_compress_in_place_avx512 .global blake3_compress_xof_avx512 .global _blake3_compress_xof_avx512 +.global blake3_xof_many_avx512 +.global _blake3_xof_many_avx512 #ifdef __APPLE__ .text @@ -2553,6 +2555,2243 @@ blake3_compress_xof_avx512: vmovdqu xmmword ptr [r9+0x30], xmm3 ret +.p2align 6 +blake3_xof_many_avx512: +_blake3_xof_many_avx512: + _CET_ENDBR + mov r10,QWORD PTR [rsp+0x8] + cmp r10,0x1 + ja 2f + vmovdqu xmm0,XMMWORD PTR [rdi] + vmovdqu xmm1,XMMWORD PTR [rdi+0x10] + movzx eax,r8b + movzx edx,dl + shl rax,0x20 + add rdx,rax + vmovq xmm3,rcx + vmovq xmm4,rdx + vpunpcklqdq xmm3,xmm3,xmm4 + vmovaps xmm2,XMMWORD PTR [BLAKE3_IV+rip] + vmovups xmm8,XMMWORD PTR [rsi] + vmovups xmm9,XMMWORD PTR [rsi+0x10] + vshufps xmm4,xmm8,xmm9,0x88 + vshufps xmm5,xmm8,xmm9,0xdd + vmovups xmm8,XMMWORD PTR [rsi+0x20] + vmovups xmm9,XMMWORD PTR [rsi+0x30] + vshufps xmm6,xmm8,xmm9,0x88 + vshufps xmm7,xmm8,xmm9,0xdd + vpshufd xmm6,xmm6,0x93 + vpshufd xmm7,xmm7,0x93 + mov al,0x7 +3: + vpaddd xmm0,xmm0,xmm4 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm5 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x93 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x39 + vpaddd xmm0,xmm0,xmm6 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm7 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 
+ vpshufd xmm0,xmm0,0x39 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x93 + dec al + je 3f + vshufps xmm8,xmm4,xmm5,0xd6 + vpshufd xmm9,xmm4,0xf + vpshufd xmm4,xmm8,0x39 + vshufps xmm8,xmm6,xmm7,0xfa + vpblendd xmm9,xmm9,xmm8,0xaa + vpunpcklqdq xmm8,xmm7,xmm5 + vpblendd xmm8,xmm8,xmm6,0x88 + vpshufd xmm8,xmm8,0x78 + vpunpckhdq xmm5,xmm5,xmm7 + vpunpckldq xmm6,xmm6,xmm5 + vpshufd xmm7,xmm6,0x1e + vmovdqa xmm5,xmm9 + vmovdqa xmm6,xmm8 + jmp 3b +3: + vpxor xmm0,xmm0,xmm2 + vpxor xmm1,xmm1,xmm3 + vpxor xmm2,xmm2,XMMWORD PTR [rdi] + vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + ret +.p2align 6 +2: + push rbp + mov rbp,rsp + sub rsp,0x90 + and rsp,0xffffffffffffffc0 + vpbroadcastd zmm0,ecx + shr rcx,0x20 + vpbroadcastd zmm1,ecx + vpaddd zmm2,zmm0,ZMMWORD PTR [ADD0+rip] + vpcmpltud k1,zmm2,zmm0 + vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} + vmovdqa32 ZMMWORD PTR [rsp],zmm2 + vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 + cmp r10,0x10 + jb 2f +3: + vpbroadcastd zmm16,DWORD PTR [rsi] + vpbroadcastd zmm17,DWORD PTR [rsi+0x4] + vpbroadcastd zmm18,DWORD PTR [rsi+0x8] + vpbroadcastd zmm19,DWORD PTR [rsi+0xc] + vpbroadcastd zmm20,DWORD PTR [rsi+0x10] + vpbroadcastd zmm21,DWORD PTR [rsi+0x14] + vpbroadcastd zmm22,DWORD PTR [rsi+0x18] + vpbroadcastd zmm23,DWORD PTR [rsi+0x1c] + vpbroadcastd zmm24,DWORD PTR [rsi+0x20] + vpbroadcastd zmm25,DWORD PTR [rsi+0x24] + vpbroadcastd zmm26,DWORD PTR [rsi+0x28] + vpbroadcastd zmm27,DWORD PTR [rsi+0x2c] + vpbroadcastd zmm28,DWORD PTR [rsi+0x30] + vpbroadcastd zmm29,DWORD PTR [rsi+0x34] + vpbroadcastd zmm30,DWORD PTR [rsi+0x38] + vpbroadcastd zmm31,DWORD PTR [rsi+0x3c] + vpbroadcastd zmm0,DWORD PTR [rdi] + vpbroadcastd zmm1,DWORD PTR [rdi+0x4] + vpbroadcastd zmm2,DWORD PTR [rdi+0x8] + vpbroadcastd zmm3,DWORD PTR [rdi+0xc] + vpbroadcastd zmm4,DWORD PTR [rdi+0x10] + vpbroadcastd zmm5,DWORD PTR [rdi+0x14] + 
vpbroadcastd zmm6,DWORD PTR [rdi+0x18] + vpbroadcastd zmm7,DWORD PTR [rdi+0x1c] + vpbroadcastd zmm8,DWORD PTR [BLAKE3_IV_0+rip] + vpbroadcastd zmm9,DWORD PTR [BLAKE3_IV_1+rip] + vpbroadcastd zmm10,DWORD PTR [BLAKE3_IV_2+rip] + vpbroadcastd zmm11,DWORD PTR [BLAKE3_IV_3+rip] + vmovdqa32 zmm12,ZMMWORD PTR [rsp] + vmovdqa32 zmm13,ZMMWORD PTR [rsp+0x40] + vpbroadcastd zmm14,edx + vpbroadcastd zmm15,r8d + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm20 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm17 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm21 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm24 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm28 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord 
zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm25 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm29 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm18 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm23 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm22 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm16 + vpaddd zmm3,zmm3,zmm29 + vpaddd 
zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm17 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm25 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm27 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm30 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd 
zmm0,zmm0,zmm19 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm29 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm20 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm22 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm27 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord 
zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm21 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm31 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm26 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm30 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm23 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm19 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord 
zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm20 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm21 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm24 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm28 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm31 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd 
zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm29 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm26 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm23 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm16 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm18 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm17 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord 
zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm25 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm24 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm30 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm28 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm29 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 
+ vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm19 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm22 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm27 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm17 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm31 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm25 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm4 + 
vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm30 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm19 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm26 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm20 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpxord zmm0,zmm0,zmm8 + vpxord 
zmm1,zmm1,zmm9 + vpxord zmm2,zmm2,zmm10 + vpxord zmm3,zmm3,zmm11 + vpxord zmm4,zmm4,zmm12 + vpxord zmm5,zmm5,zmm13 + vpxord zmm6,zmm6,zmm14 + vpxord zmm7,zmm7,zmm15 + vpxord zmm8,zmm8,DWORD PTR [rdi]{1to16} + vpxord zmm9,zmm9,DWORD PTR [rdi+0x4]{1to16} + vpxord zmm10,zmm10,DWORD PTR [rdi+0x8]{1to16} + vpxord zmm11,zmm11,DWORD PTR [rdi+0xc]{1to16} + vpxord zmm12,zmm12,DWORD PTR [rdi+0x10]{1to16} + vpxord zmm13,zmm13,DWORD PTR [rdi+0x14]{1to16} + vpxord zmm14,zmm14,DWORD PTR [rdi+0x18]{1to16} + vpxord zmm15,zmm15,DWORD PTR [rdi+0x1c]{1to16} + vpunpckldq zmm16,zmm0,zmm1 + vpunpckhdq zmm17,zmm0,zmm1 + vpunpckldq zmm18,zmm2,zmm3 + vpunpckhdq zmm19,zmm2,zmm3 + vpunpckldq zmm20,zmm4,zmm5 + vpunpckhdq zmm21,zmm4,zmm5 + vpunpckldq zmm22,zmm6,zmm7 + vpunpckhdq zmm23,zmm6,zmm7 + vpunpckldq zmm24,zmm8,zmm9 + vpunpckhdq zmm25,zmm8,zmm9 + vpunpckldq zmm26,zmm10,zmm11 + vpunpckhdq zmm27,zmm10,zmm11 + vpunpckldq zmm28,zmm12,zmm13 + vpunpckhdq zmm29,zmm12,zmm13 + vpunpckldq zmm30,zmm14,zmm15 + vpunpckhdq zmm31,zmm14,zmm15 + vpunpcklqdq zmm0,zmm16,zmm18 + vpunpckhqdq zmm1,zmm16,zmm18 + vpunpcklqdq zmm2,zmm17,zmm19 + vpunpckhqdq zmm3,zmm17,zmm19 + vpunpcklqdq zmm4,zmm20,zmm22 + vpunpckhqdq zmm5,zmm20,zmm22 + vpunpcklqdq zmm6,zmm21,zmm23 + vpunpckhqdq zmm7,zmm21,zmm23 + vpunpcklqdq zmm8,zmm24,zmm26 + vpunpckhqdq zmm9,zmm24,zmm26 + vpunpcklqdq zmm10,zmm25,zmm27 + vpunpckhqdq zmm11,zmm25,zmm27 + vpunpcklqdq zmm12,zmm28,zmm30 + vpunpckhqdq zmm13,zmm28,zmm30 + vpunpcklqdq zmm14,zmm29,zmm31 + vpunpckhqdq zmm15,zmm29,zmm31 + vshufi32x4 zmm16,zmm0,zmm4,0x88 + vshufi32x4 zmm17,zmm1,zmm5,0x88 + vshufi32x4 zmm18,zmm2,zmm6,0x88 + vshufi32x4 zmm19,zmm3,zmm7,0x88 + vshufi32x4 zmm20,zmm0,zmm4,0xdd + vshufi32x4 zmm21,zmm1,zmm5,0xdd + vshufi32x4 zmm22,zmm2,zmm6,0xdd + vshufi32x4 zmm23,zmm3,zmm7,0xdd + vshufi32x4 zmm24,zmm8,zmm12,0x88 + vshufi32x4 zmm25,zmm9,zmm13,0x88 + vshufi32x4 zmm26,zmm10,zmm14,0x88 + vshufi32x4 zmm27,zmm11,zmm15,0x88 + vshufi32x4 zmm28,zmm8,zmm12,0xdd + vshufi32x4 
zmm29,zmm9,zmm13,0xdd + vshufi32x4 zmm30,zmm10,zmm14,0xdd + vshufi32x4 zmm31,zmm11,zmm15,0xdd + vshufi32x4 zmm0,zmm16,zmm24,0x88 + vshufi32x4 zmm1,zmm17,zmm25,0x88 + vshufi32x4 zmm2,zmm18,zmm26,0x88 + vshufi32x4 zmm3,zmm19,zmm27,0x88 + vshufi32x4 zmm4,zmm20,zmm28,0x88 + vshufi32x4 zmm5,zmm21,zmm29,0x88 + vshufi32x4 zmm6,zmm22,zmm30,0x88 + vshufi32x4 zmm7,zmm23,zmm31,0x88 + vshufi32x4 zmm8,zmm16,zmm24,0xdd + vshufi32x4 zmm9,zmm17,zmm25,0xdd + vshufi32x4 zmm10,zmm18,zmm26,0xdd + vshufi32x4 zmm11,zmm19,zmm27,0xdd + vshufi32x4 zmm12,zmm20,zmm28,0xdd + vshufi32x4 zmm13,zmm21,zmm29,0xdd + vshufi32x4 zmm14,zmm22,zmm30,0xdd + vshufi32x4 zmm15,zmm23,zmm31,0xdd + vmovdqu32 ZMMWORD PTR [r9],zmm0 + vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1 + vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2 + vmovdqu32 ZMMWORD PTR [r9+0xc0],zmm3 + vmovdqu32 ZMMWORD PTR [r9+0x100],zmm4 + vmovdqu32 ZMMWORD PTR [r9+0x140],zmm5 + vmovdqu32 ZMMWORD PTR [r9+0x180],zmm6 + vmovdqu32 ZMMWORD PTR [r9+0x1c0],zmm7 + vmovdqu32 ZMMWORD PTR [r9+0x200],zmm8 + vmovdqu32 ZMMWORD PTR [r9+0x240],zmm9 + vmovdqu32 ZMMWORD PTR [r9+0x280],zmm10 + vmovdqu32 ZMMWORD PTR [r9+0x2c0],zmm11 + vmovdqu32 ZMMWORD PTR [r9+0x300],zmm12 + vmovdqu32 ZMMWORD PTR [r9+0x340],zmm13 + vmovdqu32 ZMMWORD PTR [r9+0x380],zmm14 + vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15 + vmovdqa32 zmm0,ZMMWORD PTR [rsp] + vmovdqa32 zmm1,ZMMWORD PTR [rsp+0x40] + vpaddd zmm2,zmm0,DWORD PTR [ADD16+rip]{1to16} + vpcmpltud k1,zmm2,zmm0 + vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} + vmovdqa32 ZMMWORD PTR [rsp],zmm2 + vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 + add r9,0x400 + sub r10,0x10 + cmp r10,0x10 + jae 3b + test r10,r10 + jne 2f +9: + vzeroupper + mov rsp,rbp + pop rbp + ret +2: + test r10,0x8 + je 2f + vpbroadcastd ymm16,DWORD PTR [rsi] + vpbroadcastd ymm17,DWORD PTR [rsi+0x4] + vpbroadcastd ymm18,DWORD PTR [rsi+0x8] + vpbroadcastd ymm19,DWORD PTR [rsi+0xc] + vpbroadcastd ymm20,DWORD PTR [rsi+0x10] + vpbroadcastd ymm21,DWORD PTR [rsi+0x14] + vpbroadcastd ymm22,DWORD PTR 
[rsi+0x18] + vpbroadcastd ymm23,DWORD PTR [rsi+0x1c] + vpbroadcastd ymm24,DWORD PTR [rsi+0x20] + vpbroadcastd ymm25,DWORD PTR [rsi+0x24] + vpbroadcastd ymm26,DWORD PTR [rsi+0x28] + vpbroadcastd ymm27,DWORD PTR [rsi+0x2c] + vpbroadcastd ymm28,DWORD PTR [rsi+0x30] + vpbroadcastd ymm29,DWORD PTR [rsi+0x34] + vpbroadcastd ymm30,DWORD PTR [rsi+0x38] + vpbroadcastd ymm31,DWORD PTR [rsi+0x3c] + vpbroadcastd ymm0,DWORD PTR [rdi] + vpbroadcastd ymm1,DWORD PTR [rdi+0x4] + vpbroadcastd ymm2,DWORD PTR [rdi+0x8] + vpbroadcastd ymm3,DWORD PTR [rdi+0xc] + vpbroadcastd ymm4,DWORD PTR [rdi+0x10] + vpbroadcastd ymm5,DWORD PTR [rdi+0x14] + vpbroadcastd ymm6,DWORD PTR [rdi+0x18] + vpbroadcastd ymm7,DWORD PTR [rdi+0x1c] + vpbroadcastd ymm8,DWORD PTR [BLAKE3_IV_0+rip] + vpbroadcastd ymm9,DWORD PTR [BLAKE3_IV_1+rip] + vpbroadcastd ymm10,DWORD PTR [BLAKE3_IV_2+rip] + vpbroadcastd ymm11,DWORD PTR [BLAKE3_IV_3+rip] + vmovdqa ymm12,YMMWORD PTR [rsp] + vmovdqa ymm13,YMMWORD PTR [rsp+0x40] + vpbroadcastd ymm14,edx + vpbroadcastd ymm15,r8d + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm20 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm17 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm21 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + 
vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm24 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm28 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm25 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm29 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm18 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm23 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd 
ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm22 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm16 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm17 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm25 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm27 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm30 + vpaddd 
ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm19 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm29 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm20 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord 
ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm22 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm27 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm21 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm31 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm26 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm30 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord 
ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm23 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm19 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm20 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm21 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm24 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd 
ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm28 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm31 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm29 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm26 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm23 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm16 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord 
ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm18 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm17 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm25 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm24 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm30 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm28 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord 
ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm29 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm19 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm22 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm27 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm17 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 
+ vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm31 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm25 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm30 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm19 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm26 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm20 + vpaddd ymm3,ymm3,ymm29 + 
vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpxor ymm0,ymm0,ymm8 + vpxor ymm1,ymm1,ymm9 + vpxor ymm2,ymm2,ymm10 + vpxor ymm3,ymm3,ymm11 + vpxor ymm4,ymm4,ymm12 + vpxor ymm5,ymm5,ymm13 + vpxor ymm6,ymm6,ymm14 + vpxor ymm7,ymm7,ymm15 + vpxord ymm8,ymm8,DWORD PTR [rdi]{1to8} + vpxord ymm9,ymm9,DWORD PTR [rdi+0x4]{1to8} + vpxord ymm10,ymm10,DWORD PTR [rdi+0x8]{1to8} + vpxord ymm11,ymm11,DWORD PTR [rdi+0xc]{1to8} + vpxord ymm12,ymm12,DWORD PTR [rdi+0x10]{1to8} + vpxord ymm13,ymm13,DWORD PTR [rdi+0x14]{1to8} + vpxord ymm14,ymm14,DWORD PTR [rdi+0x18]{1to8} + vpxord ymm15,ymm15,DWORD PTR [rdi+0x1c]{1to8} + vpunpckldq ymm16,ymm0,ymm1 + vpunpckhdq ymm17,ymm0,ymm1 + vpunpckldq ymm18,ymm2,ymm3 + vpunpckhdq ymm19,ymm2,ymm3 + vpunpckldq ymm20,ymm4,ymm5 + vpunpckhdq ymm21,ymm4,ymm5 + vpunpckldq ymm22,ymm6,ymm7 + vpunpckhdq ymm23,ymm6,ymm7 + vpunpckldq ymm24,ymm8,ymm9 + vpunpckhdq ymm25,ymm8,ymm9 + vpunpckldq ymm26,ymm10,ymm11 + vpunpckhdq ymm27,ymm10,ymm11 + vpunpckldq ymm28,ymm12,ymm13 + vpunpckhdq ymm29,ymm12,ymm13 + vpunpckldq ymm30,ymm14,ymm15 + vpunpckhdq ymm31,ymm14,ymm15 + vpunpcklqdq ymm0,ymm16,ymm18 + vpunpckhqdq ymm1,ymm16,ymm18 + vpunpcklqdq ymm2,ymm17,ymm19 + vpunpckhqdq ymm3,ymm17,ymm19 + vpunpcklqdq ymm4,ymm20,ymm22 + vpunpckhqdq ymm5,ymm20,ymm22 + vpunpcklqdq ymm6,ymm21,ymm23 + vpunpckhqdq ymm7,ymm21,ymm23 + vpunpcklqdq ymm8,ymm24,ymm26 + vpunpckhqdq ymm9,ymm24,ymm26 + vpunpcklqdq ymm10,ymm25,ymm27 + vpunpckhqdq 
ymm11,ymm25,ymm27 + vpunpcklqdq ymm12,ymm28,ymm30 + vpunpckhqdq ymm13,ymm28,ymm30 + vpunpcklqdq ymm14,ymm29,ymm31 + vpunpckhqdq ymm15,ymm29,ymm31 + vshufi32x4 ymm16,ymm0,ymm4,0x0 + vshufi32x4 ymm17,ymm8,ymm12,0x0 + vshufi32x4 ymm18,ymm1,ymm5,0x0 + vshufi32x4 ymm19,ymm9,ymm13,0x0 + vshufi32x4 ymm20,ymm2,ymm6,0x0 + vshufi32x4 ymm21,ymm10,ymm14,0x0 + vshufi32x4 ymm22,ymm3,ymm7,0x0 + vshufi32x4 ymm23,ymm11,ymm15,0x0 + vshufi32x4 ymm24,ymm0,ymm4,0x3 + vshufi32x4 ymm25,ymm8,ymm12,0x3 + vshufi32x4 ymm26,ymm1,ymm5,0x3 + vshufi32x4 ymm27,ymm9,ymm13,0x3 + vshufi32x4 ymm28,ymm2,ymm6,0x3 + vshufi32x4 ymm29,ymm10,ymm14,0x3 + vshufi32x4 ymm30,ymm3,ymm7,0x3 + vshufi32x4 ymm31,ymm11,ymm15,0x3 + vmovdqu32 YMMWORD PTR [r9],ymm16 + vmovdqu32 YMMWORD PTR [r9+0x20],ymm17 + vmovdqu32 YMMWORD PTR [r9+0x40],ymm18 + vmovdqu32 YMMWORD PTR [r9+0x60],ymm19 + vmovdqu32 YMMWORD PTR [r9+0x80],ymm20 + vmovdqu32 YMMWORD PTR [r9+0xa0],ymm21 + vmovdqu32 YMMWORD PTR [r9+0xc0],ymm22 + vmovdqu32 YMMWORD PTR [r9+0xe0],ymm23 + vmovdqu32 YMMWORD PTR [r9+0x100],ymm24 + vmovdqu32 YMMWORD PTR [r9+0x120],ymm25 + vmovdqu32 YMMWORD PTR [r9+0x140],ymm26 + vmovdqu32 YMMWORD PTR [r9+0x160],ymm27 + vmovdqu32 YMMWORD PTR [r9+0x180],ymm28 + vmovdqu32 YMMWORD PTR [r9+0x1a0],ymm29 + vmovdqu32 YMMWORD PTR [r9+0x1c0],ymm30 + vmovdqu32 YMMWORD PTR [r9+0x1e0],ymm31 + vmovdqa ymm0,YMMWORD PTR [rsp+0x20] + vmovdqa ymm1,YMMWORD PTR [rsp+0x60] + vmovdqa YMMWORD PTR [rsp],ymm0 + vmovdqa YMMWORD PTR [rsp+0x40],ymm1 + add r9,0x200 + sub r10,0x8 +2: + test r10,0x4 + je 2f + vbroadcasti32x4 zmm0,XMMWORD PTR [rdi] + vbroadcasti32x4 zmm1,XMMWORD PTR [rdi+0x10] + vbroadcasti32x4 zmm2,XMMWORD PTR [BLAKE3_IV+rip] + vmovdqa xmm12,XMMWORD PTR [rsp] + vmovdqa xmm13,XMMWORD PTR [rsp+0x40] + vpunpckldq xmm14,xmm12,xmm13 + vpunpckhdq xmm15,xmm12,xmm13 + vpermq ymm14,ymm14,0xdc + vpermq ymm15,ymm15,0xdc + vpbroadcastd zmm12,edx + vinserti64x4 zmm13,zmm14,ymm15,0x1 + mov eax,0x4444 + kmovw k2,eax + vpblendmd zmm13{k2},zmm13,zmm12 + vpbroadcastd 
zmm15,r8d + mov eax,0x8888 + kmovw k4,eax + vpblendmd zmm3{k4},zmm13,zmm15 + mov eax,0xaaaa + kmovw k3,eax + vbroadcasti32x4 zmm8,XMMWORD PTR [rsi] + vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x10] + vshufps zmm4,zmm8,zmm9,0x88 + vshufps zmm5,zmm8,zmm9,0xdd + vbroadcasti32x4 zmm8,XMMWORD PTR [rsi+0x20] + vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x30] + vshufps zmm6,zmm8,zmm9,0x88 + vshufps zmm7,zmm8,zmm9,0xdd + vpshufd zmm6,zmm6,0x93 + vpshufd zmm7,zmm7,0x93 + mov al,0x7 +3: + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x10 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0xc + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0x7 + vpshufd zmm0,zmm0,0x93 + vpshufd zmm3,zmm3,0x4e + vpshufd zmm2,zmm2,0x39 + vpaddd zmm0,zmm0,zmm6 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x10 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0xc + vpaddd zmm0,zmm0,zmm7 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0x7 + vpshufd zmm0,zmm0,0x39 + vpshufd zmm3,zmm3,0x4e + vpshufd zmm2,zmm2,0x93 + dec al + je 3f + vshufps zmm8,zmm4,zmm5,0xd6 + vpshufd zmm9,zmm4,0xf + vpshufd zmm4,zmm8,0x39 + vshufps zmm8,zmm6,zmm7,0xfa + vpblendmd zmm9{k3},zmm9,zmm8 + vpunpcklqdq zmm8,zmm7,zmm5 + vpblendmd zmm8{k4},zmm8,zmm6 + vpshufd zmm8,zmm8,0x78 + vpunpckhdq zmm5,zmm5,zmm7 + vpunpckldq zmm6,zmm6,zmm5 + vpshufd zmm7,zmm6,0x1e + vmovdqa32 zmm5,zmm9 + vmovdqa32 zmm6,zmm8 + jmp 3b +3: + vpxord zmm0,zmm0,zmm2 + vpxord zmm1,zmm1,zmm3 + vbroadcasti32x4 zmm8,XMMWORD PTR [rdi] + vbroadcasti32x4 zmm9,XMMWORD PTR [rdi+0x10] + vpxord zmm2,zmm2,zmm8 + vpxord zmm3,zmm3,zmm9 + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + 
vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 + vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 + vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 + vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 + vextracti32x4 XMMWORD PTR [r9+0x80],zmm0,0x2 + vextracti32x4 XMMWORD PTR [r9+0x90],zmm1,0x2 + vextracti32x4 XMMWORD PTR [r9+0xa0],zmm2,0x2 + vextracti32x4 XMMWORD PTR [r9+0xb0],zmm3,0x2 + vextracti32x4 XMMWORD PTR [r9+0xc0],zmm0,0x3 + vextracti32x4 XMMWORD PTR [r9+0xd0],zmm1,0x3 + vextracti32x4 XMMWORD PTR [r9+0xe0],zmm2,0x3 + vextracti32x4 XMMWORD PTR [r9+0xf0],zmm3,0x3 + vmovdqa xmm0,XMMWORD PTR [rsp+0x10] + vmovdqa xmm1,XMMWORD PTR [rsp+0x50] + vmovdqa XMMWORD PTR [rsp],xmm0 + vmovdqa XMMWORD PTR [rsp+0x40],xmm1 + add r9,0x100 + sub r10,0x4 +2: + test r10,0x2 + je 2f + vbroadcasti128 ymm0,XMMWORD PTR [rdi] + vbroadcasti128 ymm1,XMMWORD PTR [rdi+0x10] + vmovd xmm13,DWORD PTR [rsp] + vpinsrd xmm13,xmm13,DWORD PTR [rsp+0x40],0x1 + vpinsrd xmm13,xmm13,edx,0x2 + vmovd xmm14,DWORD PTR [rsp+0x4] + vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x44],0x1 + vpinsrd xmm14,xmm14,edx,0x2 + vinserti128 ymm13,ymm13,xmm14,0x1 + vbroadcasti128 ymm2,XMMWORD PTR [BLAKE3_IV+rip] + vpbroadcastd ymm8,r8d + vpblendd ymm3,ymm13,ymm8,0x88 + vbroadcasti128 ymm8,XMMWORD PTR [rsi] + vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x10] + vshufps ymm4,ymm8,ymm9,0x88 + vshufps ymm5,ymm8,ymm9,0xdd + vbroadcasti128 ymm8,XMMWORD PTR [rsi+0x20] + vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x30] + vshufps ymm6,ymm8,ymm9,0x88 + vshufps ymm7,ymm8,ymm9,0xdd + vpshufd ymm6,ymm6,0x93 + vpshufd ymm7,ymm7,0x93 + mov al,0x7 +3: + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x10 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0xc + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x8 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0x7 + vpshufd ymm0,ymm0,0x93 + vpshufd ymm3,ymm3,0x4e + vpshufd ymm2,ymm2,0x39 + vpaddd ymm0,ymm0,ymm6 
+ vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x10 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0xc + vpaddd ymm0,ymm0,ymm7 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x8 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0x7 + vpshufd ymm0,ymm0,0x39 + vpshufd ymm3,ymm3,0x4e + vpshufd ymm2,ymm2,0x93 + dec al + je 3f + vshufps ymm8,ymm4,ymm5,0xd6 + vpshufd ymm9,ymm4,0xf + vpshufd ymm4,ymm8,0x39 + vshufps ymm8,ymm6,ymm7,0xfa + vpblendd ymm9,ymm9,ymm8,0xaa + vpunpcklqdq ymm8,ymm7,ymm5 + vpblendd ymm8,ymm8,ymm6,0x88 + vpshufd ymm8,ymm8,0x78 + vpunpckhdq ymm5,ymm5,ymm7 + vpunpckldq ymm6,ymm6,ymm5 + vpshufd ymm7,ymm6,0x1e + vmovdqa ymm5,ymm9 + vmovdqa ymm6,ymm8 + jmp 3b +3: + vpxor ymm0,ymm0,ymm2 + vpxor ymm1,ymm1,ymm3 + vbroadcasti128 ymm8,XMMWORD PTR [rdi] + vbroadcasti128 ymm9,XMMWORD PTR [rdi+0x10] + vpxor ymm2,ymm2,ymm8 + vpxor ymm3,ymm3,ymm9 + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 + vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 + vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 + vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 + vmovdqu xmm0,XMMWORD PTR [rsp+0x8] + vmovdqu xmm1,XMMWORD PTR [rsp+0x48] + vmovdqa XMMWORD PTR [rsp],xmm0 + vmovdqa XMMWORD PTR [rsp+0x40],xmm1 + add r9,0x80 + sub r10,0x2 +2: + test r10,0x1 + je 9b + vmovdqu xmm0,XMMWORD PTR [rdi] + vmovdqu xmm1,XMMWORD PTR [rdi+0x10] + vmovd xmm14,DWORD PTR [rsp] + vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x40],0x1 + vpinsrd xmm14,xmm14,edx,0x2 + vmovdqa xmm2,XMMWORD PTR [BLAKE3_IV+rip] + vpinsrd xmm3,xmm14,r8d,0x3 + vmovups xmm8,XMMWORD PTR [rsi] + vmovups xmm9,XMMWORD PTR [rsi+0x10] + vshufps xmm4,xmm8,xmm9,0x88 + vshufps xmm5,xmm8,xmm9,0xdd + vmovups xmm8,XMMWORD PTR [rsi+0x20] + vmovups xmm9,XMMWORD PTR [rsi+0x30] + vshufps xmm6,xmm8,xmm9,0x88 + vshufps xmm7,xmm8,xmm9,0xdd + vpshufd 
xmm6,xmm6,0x93 + vpshufd xmm7,xmm7,0x93 + mov al,0x7 +3: + vpaddd xmm0,xmm0,xmm4 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm5 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x93 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x39 + vpaddd xmm0,xmm0,xmm6 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm7 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x39 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x93 + dec al + je 3f + vshufps xmm8,xmm4,xmm5,0xd6 + vpshufd xmm9,xmm4,0xf + vpshufd xmm4,xmm8,0x39 + vshufps xmm8,xmm6,xmm7,0xfa + vpblendd xmm9,xmm9,xmm8,0xaa + vpunpcklqdq xmm8,xmm7,xmm5 + vpblendd xmm8,xmm8,xmm6,0x88 + vpshufd xmm8,xmm8,0x78 + vpunpckhdq xmm5,xmm5,xmm7 + vpunpckldq xmm6,xmm6,xmm5 + vpshufd xmm7,xmm6,0x1e + vmovdqa xmm5,xmm9 + vmovdqa xmm6,xmm8 + jmp 3b +3: + vpxor xmm0,xmm0,xmm2 + vpxor xmm1,xmm1,xmm3 + vpxor xmm2,xmm2,XMMWORD PTR [rdi] + vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + jmp 9b + + #ifdef __APPLE__ .static_data #else diff --git a/third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs b/third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs index 41e4938bb0..ce7185ef3e 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs +++ b/third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs @@ -282,6 +282,16 @@ pub mod ffi { flags_end: u8, out: *mut u8, ); + #[cfg(unix)] + pub fn blake3_xof_many_avx512( + cv: *const u32, + block: *const u8, 
+ block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + outblocks: usize, + ); } } diff --git a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs index 0730d93062..2070886dfd 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs +++ b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs @@ -359,6 +359,105 @@ fn test_hash_many_neon() { test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon); } +#[allow(unused)] +type XofManyFunction = unsafe extern "C" fn( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + outblocks: usize, +); + +// A shared helper function for platform-specific tests. +#[allow(unused)] +pub fn test_xof_many_fn(xof_many_function: XofManyFunction) { + let mut block = [0; BLOCK_LEN]; + let block_len = 42; + crate::test::paint_test_input(&mut block[..block_len]); + let cv = [40, 41, 42, 43, 44, 45, 46, 47]; + let flags = KEYED_HASH; + + // Test a few different initial counter values. + // - 0: The base case. + // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... 
+ let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) outputs + const OUTPUT_SIZE: usize = 31 * BLOCK_LEN; + + let mut portable_out = [0u8; OUTPUT_SIZE]; + for (i, out_block) in portable_out.chunks_exact_mut(BLOCK_LEN).enumerate() { + unsafe { + crate::ffi::blake3_compress_xof_portable( + cv.as_ptr(), + block.as_ptr(), + block_len as u8, + counter + i as u64, + flags, + out_block.as_mut_ptr(), + ); + } + } + + let mut test_out = [0u8; OUTPUT_SIZE]; + unsafe { + xof_many_function( + cv.as_ptr(), + block.as_ptr(), + block_len as u8, + counter, + flags, + test_out.as_mut_ptr(), + OUTPUT_SIZE / BLOCK_LEN, + ); + } + + assert_eq!(portable_out, test_out); + } + + // Test that xof_many doesn't write more blocks than requested. Note that the current assembly + // implementation always outputs at least one block, so we don't test the zero case. + for block_count in 1..=32 { + let mut array = [0; BLOCK_LEN * 33]; + let output_start = 17; + let output_len = block_count * BLOCK_LEN; + let output_end = output_start + output_len; + let output = &mut array[output_start..output_end]; + unsafe { + xof_many_function( + cv.as_ptr(), + block.as_ptr(), + block_len as u8, + 0, + flags, + output.as_mut_ptr(), + block_count, + ); + } + for i in 0..array.len() { + if i < output_start || output_end <= i { + assert_eq!(0, array[i], "index {i}"); + } + } + } +} + +#[test] +#[cfg(unix)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_xof_many_avx512() { + if !crate::avx512_detected() { + return; + } + test_xof_many_fn(crate::ffi::x86::blake3_xof_many_avx512); +} + #[test] fn test_compare_reference_impl() { const OUT: usize = 303; // more than 64, not a multiple of 4 diff --git a/third-party/blake3/c/blake3_dispatch.c b/third-party/blake3/c/blake3_dispatch.c index af6c3dadc7..f04f5a0877 100644 --- a/third-party/blake3/c/blake3_dispatch.c +++ b/third-party/blake3/c/blake3_dispatch.c 
@@ -4,9 +4,12 @@ #include "blake3_impl.h" -#if defined(IS_X86) #if defined(_MSC_VER) #include +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) #include #elif defined(__GNUC__) #include @@ -220,6 +223,29 @@ void blake3_compress_xof(const uint32_t cv[8], blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); } + +void blake3_xof_many(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64], size_t outblocks) { + if (outblocks == 0) { + // The current assembly implementation always outputs at least 1 block. + return; + } +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); +#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); + return; + } +#endif +#endif + for(size_t i = 0; i < outblocks; ++i) { + blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i); + } +} + void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, diff --git a/third-party/blake3/c/blake3_impl.h b/third-party/blake3/c/blake3_impl.h index beab5cf53c..51d792a899 100644 --- a/third-party/blake3/c/blake3_impl.h +++ b/third-party/blake3/c/blake3_impl.h @@ -28,7 +28,7 @@ enum blake3_flags { #define INLINE static inline __attribute__((always_inline)) #endif -#if defined(__x86_64__) || defined(_M_X64) +#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC) #define IS_X86 #define IS_X86_64 #endif @@ -38,7 +38,7 @@ enum blake3_flags { #define IS_X86_32 #endif -#if defined(__aarch64__) || defined(_M_ARM64) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) #define IS_AARCH64 #endif @@ -162,6 +162,13 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], key_words[7] = load32(&key[7 * 4]); } +INLINE void 
load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN], + uint32_t block_words[16]) { + for (size_t i = 0; i < 16; i++) { + block_words[i] = load32(&block[i * 4]); + } +} + INLINE void store32(void *dst, uint32_t w) { uint8_t *p = (uint8_t *)dst; p[0] = (uint8_t)(w >> 0); @@ -191,6 +198,11 @@ void blake3_compress_xof(const uint32_t cv[8], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); +void blake3_xof_many(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64], size_t outblocks); + void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, @@ -270,6 +282,13 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +#if !defined(_WIN32) +void blake3_xof_many_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t* out, size_t outblocks); +#endif #endif #endif diff --git a/third-party/blake3/c/blake3_neon.c b/third-party/blake3/c/blake3_neon.c index 90bdd572ca..53ce83c3b6 100644 --- a/third-party/blake3/c/blake3_neon.c +++ b/third-party/blake3/c/blake3_neon.c @@ -34,7 +34,7 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { } INLINE uint32x4_t rot16_128(uint32x4_t x) { - // The straightfoward implementation would be two shifts and an or, but that's + // The straightforward implementation would be two shifts and an or, but that's // slower on microarchitectures we've tested. See // https://github.com/BLAKE3-team/BLAKE3/pull/319. 
// return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); diff --git a/third-party/blake3/c/libblake3.pc.in b/third-party/blake3/c/libblake3.pc.in index 9a5f21dca3..06f2c7a9b9 100644 --- a/third-party/blake3/c/libblake3.pc.in +++ b/third-party/blake3/c/libblake3.pc.in @@ -1,7 +1,7 @@ prefix="@CMAKE_INSTALL_PREFIX@" exec_prefix="${prefix}" -libdir="${prefix}/@CMAKE_INSTALL_LIBDIR@" -includedir="${prefix}/@CMAKE_INSTALL_INCLUDEDIR@" +libdir="@PKG_CONFIG_INSTALL_LIBDIR@" +includedir="@PKG_CONFIG_INSTALL_INCLUDEDIR@" Name: @PROJECT_NAME@ Description: @PROJECT_DESCRIPTION@ diff --git a/third-party/blake3/reference_impl/reference_impl.rs b/third-party/blake3/reference_impl/reference_impl.rs index 72ad525c08..bc6138350c 100644 --- a/third-party/blake3/reference_impl/reference_impl.rs +++ b/third-party/blake3/reference_impl/reference_impl.rs @@ -78,23 +78,14 @@ fn compress( block_len: u32, flags: u32, ) -> [u32; 16] { + let counter_low = counter as u32; + let counter_high = (counter >> 32) as u32; + #[rustfmt::skip] let mut state = [ - chaining_value[0], - chaining_value[1], - chaining_value[2], - chaining_value[3], - chaining_value[4], - chaining_value[5], - chaining_value[6], - chaining_value[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter as u32, - (counter >> 32) as u32, - block_len, - flags, + chaining_value[0], chaining_value[1], chaining_value[2], chaining_value[3], + chaining_value[4], chaining_value[5], chaining_value[6], chaining_value[7], + IV[0], IV[1], IV[2], IV[3], + counter_low, counter_high, block_len, flags, ]; let mut block = *block_words; diff --git a/third-party/blake3/rust/guts/Cargo.toml b/third-party/blake3/rust/guts/Cargo.toml deleted file mode 100644 index ebcf77fd71..0000000000 --- a/third-party/blake3/rust/guts/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "blake3_guts" -version = "0.0.0" -authors = ["Jack O'Connor ", "Samuel Neves"] -description = "low-level building blocks for the BLAKE3 hash function" -repository = 
"https://github.com/BLAKE3-team/BLAKE3" -license = "CC0-1.0 OR Apache-2.0" -documentation = "https://docs.rs/blake3_guts" -readme = "readme.md" -edition = "2021" - -[dev-dependencies] -hex = "0.4.3" -reference_impl = { path = "../../reference_impl" } - -[features] -default = ["std"] -std = [] diff --git a/third-party/blake3/rust/guts/readme.md b/third-party/blake3/rust/guts/readme.md deleted file mode 100644 index 4957816df7..0000000000 --- a/third-party/blake3/rust/guts/readme.md +++ /dev/null @@ -1,80 +0,0 @@ -# The BLAKE3 Guts API - -## Introduction - -This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains -low-level, high-performance, platform-specific implementations of the BLAKE3 -compression function. This API is complicated and unsafe, and this crate will -never have a stable release. Most callers should instead use the -[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend -on this one internally. - -The code you see here (as of January 2024) is an early stage of a large planned -refactor. The motivation for this refactor is a couple of missing features in -both the Rust and C implementations: - -- The output side - ([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html) - in Rust) doesn't take advantage of the most important SIMD optimizations that - compute multiple blocks in parallel. This blocks any project that wants to - use the BLAKE3 XOF as a stream cipher - ([[1]](https://github.com/oconnor663/bessie), - [[2]](https://github.com/oconnor663/blake3_aead)). -- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need - interior nodes of the tree also don't get those SIMD optimizations. They have - to use a slow, minimalistic, unstable, doc-hidden module [(also called - `guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs). 
- -The difficulty with adding those features is that they require changes to all -of our optimized assembly and C intrinsics code. That's a couple dozen -different files that are large, platform-specific, difficult to understand, and -full of duplicated code. The higher-level Rust and C implementations of BLAKE3 -both depend on these files and will need to coordinate changes. - -At the same time, it won't be long before we add support for more platforms: - -- RISCV vector extensions -- ARM SVE -- WebAssembly SIMD - -It's important to get this refactor done before new platforms make it even -harder to do. - -## The private guts API - -This is the API that each platform reimplements, so we want it to be as simple -as possible apart from the high-performance work it needs to do. It's -completely `unsafe`, and inputs and outputs are raw pointers that are allowed -to alias (this matters for `hash_parents`, see below). - -- `degree` -- `compress` - - The single compression function, for short inputs and odd-length tails. -- `hash_chunks` -- `hash_parents` -- `xof` -- `xof_xor` - - As `xof` but XOR'ing the result into the output buffer. -- `universal_hash` - - This is a new construction specifically to support - [BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some - implementations might just stub it out with portable code. - -## The public guts API - -This is the API that this crate exposes to callers, i.e. to the main `blake3` -crate. It's a thin, portable layer on top of the private API above. The Rust -version of this API is memory-safe. - -- `degree` -- `compress` -- `hash_chunks` -- `hash_parents` - - This handles most levels of the tree, where we keep hashing SIMD_DEGREE - parents at a time. -- `reduce_parents` - - This uses the same `hash_parents` private API, but it handles the top - levels of the tree where we reduce in-place to the root parent node. 
-- `xof` -- `xof_xor` -- `universal_hash` diff --git a/third-party/blake3/rust/guts/src/lib.rs b/third-party/blake3/rust/guts/src/lib.rs deleted file mode 100644 index e9b4914ba8..0000000000 --- a/third-party/blake3/rust/guts/src/lib.rs +++ /dev/null @@ -1,1000 +0,0 @@ -//! # The BLAKE3 Guts API -//! -//! See `readme.md`. -//! -//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`] -//! that atomically initializes itself the first time you use it. -//! -//! # Example -//! -//! ```rust -//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT}; -//! -//! // Hash an input of exactly two chunks. -//! let input = [0u8; 2048]; -//! let mut outputs = TransposedVectors::new(); -//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs); -//! DETECTED_IMPL.hash_chunks( -//! &input, -//! &IV_BYTES, -//! 0, // counter -//! 0, // flags -//! left_outputs, -//! ); -//! let root_node = outputs.extract_parent_node(0); -//! let hash = DETECTED_IMPL.compress( -//! &root_node, -//! 64, // block_len -//! &IV_BYTES, -//! 0, // counter -//! PARENT | ROOT, -//! ); -//! -//! // Compute the same hash using the reference implementation. -//! let mut reference_hasher = reference_impl::Hasher::new(); -//! reference_hasher.update(&input); -//! let mut expected_hash = [0u8; 32]; -//! reference_hasher.finalize(&mut expected_hash); -//! -//! assert_eq!(hash, expected_hash); -//! ``` - -// Tests always require libstd. 
-#![cfg_attr(all(not(feature = "std"), not(test)), no_std)] - -use core::cmp; -use core::marker::PhantomData; -use core::mem; -use core::ptr; -use core::sync::atomic::{AtomicPtr, Ordering::Relaxed}; - -pub mod portable; - -#[cfg(test)] -mod test; - -pub const OUT_LEN: usize = 32; -pub const BLOCK_LEN: usize = 64; -pub const CHUNK_LEN: usize = 1024; -pub const WORD_LEN: usize = 4; -pub const UNIVERSAL_HASH_LEN: usize = 16; - -pub const CHUNK_START: u32 = 1 << 0; -pub const CHUNK_END: u32 = 1 << 1; -pub const PARENT: u32 = 1 << 2; -pub const ROOT: u32 = 1 << 3; -pub const KEYED_HASH: u32 = 1 << 4; -pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5; -pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6; - -pub const IV: CVWords = [ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; -pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV); - -pub const MSG_SCHEDULE: [[usize; 16]; 7] = [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], - [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], - [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], - [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], - [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], - [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], -]; - -// never less than 2 -pub const MAX_SIMD_DEGREE: usize = 2; - -pub type CVBytes = [u8; 32]; -pub type CVWords = [u32; 8]; -pub type BlockBytes = [u8; 64]; -pub type BlockWords = [u32; 16]; - -pub static DETECTED_IMPL: Implementation = Implementation::new( - degree_init, - compress_init, - hash_chunks_init, - hash_parents_init, - xof_init, - xof_xor_init, - universal_hash_init, -); - -fn detect() -> Implementation { - portable::implementation() -} - -fn init_detected_impl() { - let detected = detect(); - - DETECTED_IMPL - .degree_ptr - .store(detected.degree_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .compress_ptr - 
.store(detected.compress_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .hash_chunks_ptr - .store(detected.hash_chunks_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .hash_parents_ptr - .store(detected.hash_parents_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .xof_ptr - .store(detected.xof_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .xof_xor_ptr - .store(detected.xof_xor_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .universal_hash_ptr - .store(detected.universal_hash_ptr.load(Relaxed), Relaxed); -} - -pub struct Implementation { - degree_ptr: AtomicPtr<()>, - compress_ptr: AtomicPtr<()>, - hash_chunks_ptr: AtomicPtr<()>, - hash_parents_ptr: AtomicPtr<()>, - xof_ptr: AtomicPtr<()>, - xof_xor_ptr: AtomicPtr<()>, - universal_hash_ptr: AtomicPtr<()>, -} - -impl Implementation { - const fn new( - degree_fn: DegreeFn, - compress_fn: CompressFn, - hash_chunks_fn: HashChunksFn, - hash_parents_fn: HashParentsFn, - xof_fn: XofFn, - xof_xor_fn: XofFn, - universal_hash_fn: UniversalHashFn, - ) -> Self { - Self { - degree_ptr: AtomicPtr::new(degree_fn as *mut ()), - compress_ptr: AtomicPtr::new(compress_fn as *mut ()), - hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()), - hash_parents_ptr: AtomicPtr::new(hash_parents_fn as *mut ()), - xof_ptr: AtomicPtr::new(xof_fn as *mut ()), - xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()), - universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()), - } - } - - #[inline] - fn degree_fn(&self) -> DegreeFn { - unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) } - } - - #[inline] - pub fn degree(&self) -> usize { - let degree = unsafe { self.degree_fn()() }; - debug_assert!(degree >= 2); - debug_assert!(degree <= MAX_SIMD_DEGREE); - debug_assert_eq!(1, degree.count_ones(), "power of 2"); - degree - } - - #[inline] - pub fn split_transposed_vectors<'v>( - &self, - vectors: &'v mut TransposedVectors, - ) -> (TransposedSplit<'v>, TransposedSplit<'v>) { - unsafe { vectors.split(self.degree()) } - } - - #[inline] - 
fn compress_fn(&self) -> CompressFn { - unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) } - } - - #[inline] - pub fn compress( - &self, - block: &BlockBytes, - block_len: u32, - cv: &CVBytes, - counter: u64, - flags: u32, - ) -> CVBytes { - let mut out = [0u8; 32]; - unsafe { - self.compress_fn()(block, block_len, cv, counter, flags, &mut out); - } - out - } - - // The contract for HashChunksFn doesn't require the implementation to support single-chunk - // inputs. Instead we handle that case here by calling compress in a loop. - #[inline] - fn hash_one_chunk( - &self, - mut input: &[u8], - key: &CVBytes, - counter: u64, - mut flags: u32, - output: TransposedSplit, - ) { - debug_assert!(input.len() <= CHUNK_LEN); - let mut cv = *key; - flags |= CHUNK_START; - while input.len() > BLOCK_LEN { - cv = self.compress( - input[..BLOCK_LEN].try_into().unwrap(), - BLOCK_LEN as u32, - &cv, - counter, - flags, - ); - input = &input[BLOCK_LEN..]; - flags &= !CHUNK_START; - } - let mut final_block = [0u8; BLOCK_LEN]; - final_block[..input.len()].copy_from_slice(input); - cv = self.compress( - &final_block, - input.len() as u32, - &cv, - counter, - flags | CHUNK_END, - ); - unsafe { - write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr); - } - } - - #[inline] - fn hash_chunks_fn(&self) -> HashChunksFn { - unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) } - } - - #[inline] - pub fn hash_chunks( - &self, - input: &[u8], - key: &CVBytes, - counter: u64, - flags: u32, - transposed_output: TransposedSplit, - ) -> usize { - debug_assert!(input.len() <= self.degree() * CHUNK_LEN); - if input.len() <= CHUNK_LEN { - // The underlying hash_chunks_fn isn't required to support this case. Instead we handle - // it by calling compress_fn in a loop. But note that we still don't support root - // finalization or the empty input here. 
- self.hash_one_chunk(input, key, counter, flags, transposed_output); - return 1; - } - // SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently - // ignore the remainder. This makes it impossible to write out of bounds in a properly - // constructed TransposedSplit. - let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN); - unsafe { - self.hash_chunks_fn()( - input.as_ptr(), - len, - key, - counter, - flags, - transposed_output.ptr, - ); - } - if input.len() % CHUNK_LEN == 0 { - input.len() / CHUNK_LEN - } else { - (input.len() / CHUNK_LEN) + 1 - } - } - - #[inline] - fn hash_parents_fn(&self) -> HashParentsFn { - unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) } - } - - #[inline] - pub fn hash_parents( - &self, - transposed_input: &TransposedVectors, - mut num_cvs: usize, - key: &CVBytes, - flags: u32, - transposed_output: TransposedSplit, - ) -> usize { - debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); - // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. - num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); - let mut odd_cv = [0u32; 8]; - if num_cvs % 2 == 1 { - unsafe { - odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1)); - } - } - let num_parents = num_cvs / 2; - unsafe { - self.hash_parents_fn()( - transposed_input.as_ptr(), - num_parents, - key, - flags | PARENT, - transposed_output.ptr, - ); - } - if num_cvs % 2 == 1 { - unsafe { - write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents)); - } - num_parents + 1 - } else { - num_parents - } - } - - #[inline] - pub fn reduce_parents( - &self, - transposed_in_out: &mut TransposedVectors, - mut num_cvs: usize, - key: &CVBytes, - flags: u32, - ) -> usize { - debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); - // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. 
- num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); - let in_out_ptr = transposed_in_out.as_mut_ptr(); - let mut odd_cv = [0u32; 8]; - if num_cvs % 2 == 1 { - unsafe { - odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1)); - } - } - let num_parents = num_cvs / 2; - unsafe { - self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr); - } - if num_cvs % 2 == 1 { - unsafe { - write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents)); - } - num_parents + 1 - } else { - num_parents - } - } - - #[inline] - fn xof_fn(&self) -> XofFn { - unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) } - } - - #[inline] - pub fn xof( - &self, - block: &BlockBytes, - block_len: u32, - cv: &CVBytes, - mut counter: u64, - flags: u32, - mut out: &mut [u8], - ) { - let degree = self.degree(); - let simd_len = degree * BLOCK_LEN; - while !out.is_empty() { - let take = cmp::min(simd_len, out.len()); - unsafe { - self.xof_fn()( - block, - block_len, - cv, - counter, - flags | ROOT, - out.as_mut_ptr(), - take, - ); - } - out = &mut out[take..]; - counter += degree as u64; - } - } - - #[inline] - fn xof_xor_fn(&self) -> XofFn { - unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) } - } - - #[inline] - pub fn xof_xor( - &self, - block: &BlockBytes, - block_len: u32, - cv: &CVBytes, - mut counter: u64, - flags: u32, - mut out: &mut [u8], - ) { - let degree = self.degree(); - let simd_len = degree * BLOCK_LEN; - while !out.is_empty() { - let take = cmp::min(simd_len, out.len()); - unsafe { - self.xof_xor_fn()( - block, - block_len, - cv, - counter, - flags | ROOT, - out.as_mut_ptr(), - take, - ); - } - out = &mut out[take..]; - counter += degree as u64; - } - } - - #[inline] - fn universal_hash_fn(&self) -> UniversalHashFn { - unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) } - } - - #[inline] - pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] { - let degree = self.degree(); - let simd_len = degree * 
BLOCK_LEN; - let mut ret = [0u8; 16]; - while !input.is_empty() { - let take = cmp::min(simd_len, input.len()); - let mut output = [0u8; 16]; - unsafe { - self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output); - } - input = &input[take..]; - counter += degree as u64; - for byte_index in 0..16 { - ret[byte_index] ^= output[byte_index]; - } - } - ret - } -} - -impl Clone for Implementation { - fn clone(&self) -> Self { - Self { - degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)), - compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)), - hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)), - hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)), - xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)), - xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)), - universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)), - } - } -} - -// never less than 2 -type DegreeFn = unsafe extern "C" fn() -> usize; - -unsafe extern "C" fn degree_init() -> usize { - init_detected_impl(); - DETECTED_IMPL.degree_fn()() -} - -type CompressFn = unsafe extern "C" fn( - block: *const BlockBytes, // zero padded to 64 bytes - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut CVBytes, // may overlap the input -); - -unsafe extern "C" fn compress_init( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut CVBytes, -) { - init_detected_impl(); - DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out); -} - -type CompressXofFn = unsafe extern "C" fn( - block: *const BlockBytes, // zero padded to 64 bytes - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut BlockBytes, // may overlap the input -); - -type HashChunksFn = unsafe extern "C" fn( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - flags: u32, - transposed_output: *mut u32, -); - -unsafe 
extern "C" fn hash_chunks_init( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - flags: u32, - transposed_output: *mut u32, -) { - init_detected_impl(); - DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output); -} - -type HashParentsFn = unsafe extern "C" fn( - transposed_input: *const u32, - num_parents: usize, - key: *const CVBytes, - flags: u32, - transposed_output: *mut u32, // may overlap the input -); - -unsafe extern "C" fn hash_parents_init( - transposed_input: *const u32, - num_parents: usize, - key: *const CVBytes, - flags: u32, - transposed_output: *mut u32, -) { - init_detected_impl(); - DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output); -} - -// This signature covers both xof() and xof_xor(). -type XofFn = unsafe extern "C" fn( - block: *const BlockBytes, // zero padded to 64 bytes - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -); - -unsafe extern "C" fn xof_init( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - init_detected_impl(); - DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len); -} - -unsafe extern "C" fn xof_xor_init( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - init_detected_impl(); - DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len); -} - -type UniversalHashFn = unsafe extern "C" fn( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - out: *mut [u8; 16], -); - -unsafe extern "C" fn universal_hash_init( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - out: *mut [u8; 16], -) { - init_detected_impl(); - DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out); -} - -// The 
implicit degree of this implementation is MAX_SIMD_DEGREE. -#[inline(always)] -unsafe fn hash_chunks_using_compress( - compress: CompressFn, - mut input: *const u8, - mut input_len: usize, - key: *const CVBytes, - mut counter: u64, - flags: u32, - mut transposed_output: *mut u32, -) { - debug_assert!(input_len > 0); - debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN); - input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN); - while input_len > 0 { - let mut chunk_len = cmp::min(input_len, CHUNK_LEN); - input_len -= chunk_len; - // We only use 8 words of the CV, but compress returns 16. - let mut cv = *key; - let cv_ptr: *mut CVBytes = &mut cv; - let mut chunk_flags = flags | CHUNK_START; - while chunk_len > BLOCK_LEN { - compress( - input as *const BlockBytes, - BLOCK_LEN as u32, - cv_ptr, - counter, - chunk_flags, - cv_ptr, - ); - input = input.add(BLOCK_LEN); - chunk_len -= BLOCK_LEN; - chunk_flags &= !CHUNK_START; - } - let mut last_block = [0u8; BLOCK_LEN]; - ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len); - input = input.add(chunk_len); - compress( - &last_block, - chunk_len as u32, - cv_ptr, - counter, - chunk_flags | CHUNK_END, - cv_ptr, - ); - let cv_words = words_from_le_bytes_32(&cv); - for word_index in 0..8 { - transposed_output - .add(word_index * TRANSPOSED_STRIDE) - .write(cv_words[word_index]); - } - transposed_output = transposed_output.add(1); - counter += 1; - } -} - -// The implicit degree of this implementation is MAX_SIMD_DEGREE. 
-#[inline(always)] -unsafe fn hash_parents_using_compress( - compress: CompressFn, - mut transposed_input: *const u32, - mut num_parents: usize, - key: *const CVBytes, - flags: u32, - mut transposed_output: *mut u32, // may overlap the input -) { - debug_assert!(num_parents > 0); - debug_assert!(num_parents <= MAX_SIMD_DEGREE); - while num_parents > 0 { - let mut block_bytes = [0u8; 64]; - for word_index in 0..8 { - let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read(); - block_bytes[WORD_LEN * word_index..][..WORD_LEN] - .copy_from_slice(&left_child_word.to_le_bytes()); - let right_child_word = transposed_input - .add(word_index * TRANSPOSED_STRIDE + 1) - .read(); - block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN] - .copy_from_slice(&right_child_word.to_le_bytes()); - } - let mut cv = [0u8; 32]; - compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv); - let cv_words = words_from_le_bytes_32(&cv); - for word_index in 0..8 { - transposed_output - .add(word_index * TRANSPOSED_STRIDE) - .write(cv_words[word_index]); - } - transposed_input = transposed_input.add(2); - transposed_output = transposed_output.add(1); - num_parents -= 1; - } -} - -#[inline(always)] -unsafe fn xof_using_compress_xof( - compress_xof: CompressXofFn, - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - mut counter: u64, - flags: u32, - mut out: *mut u8, - mut out_len: usize, -) { - debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); - while out_len > 0 { - let mut block_output = [0u8; 64]; - compress_xof(block, block_len, cv, counter, flags, &mut block_output); - let take = cmp::min(out_len, BLOCK_LEN); - ptr::copy_nonoverlapping(block_output.as_ptr(), out, take); - out = out.add(take); - out_len -= take; - counter += 1; - } -} - -#[inline(always)] -unsafe fn xof_xor_using_compress_xof( - compress_xof: CompressXofFn, - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - mut counter: u64, - flags: u32, - mut out: 
*mut u8, - mut out_len: usize, -) { - debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); - while out_len > 0 { - let mut block_output = [0u8; 64]; - compress_xof(block, block_len, cv, counter, flags, &mut block_output); - let take = cmp::min(out_len, BLOCK_LEN); - for i in 0..take { - *out.add(i) ^= block_output[i]; - } - out = out.add(take); - out_len -= take; - counter += 1; - } -} - -#[inline(always)] -unsafe fn universal_hash_using_compress( - compress: CompressFn, - mut input: *const u8, - mut input_len: usize, - key: *const CVBytes, - mut counter: u64, - out: *mut [u8; 16], -) { - let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT; - let mut result = [0u8; 16]; - while input_len > 0 { - let block_len = cmp::min(input_len, BLOCK_LEN); - let mut block = [0u8; BLOCK_LEN]; - ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len); - let mut block_output = [0u8; 32]; - compress( - &block, - block_len as u32, - key, - counter, - flags, - &mut block_output, - ); - for i in 0..16 { - result[i] ^= block_output[i]; - } - input = input.add(block_len); - input_len -= block_len; - counter += 1; - } - *out = result; -} - -// this is in units of *words*, for pointer operations on *const/*mut u32 -const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE; - -#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))] -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]); - -impl TransposedVectors { - pub fn new() -> Self { - Self([[0; 2 * MAX_SIMD_DEGREE]; 8]) - } - - pub fn extract_cv(&self, cv_index: usize) -> CVBytes { - let mut words = [0u32; 8]; - for word_index in 0..8 { - words[word_index] = self.0[word_index][cv_index]; - } - le_bytes_from_words_32(&words) - } - - pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes { - let mut bytes = [0u8; 64]; - bytes[..32].copy_from_slice(&self.extract_cv(parent_index / 2)); - bytes[32..].copy_from_slice(&self.extract_cv(parent_index 
/ 2 + 1)); - bytes - } - - fn as_ptr(&self) -> *const u32 { - self.0[0].as_ptr() - } - - fn as_mut_ptr(&mut self) -> *mut u32 { - self.0[0].as_mut_ptr() - } - - // SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not - // necessarily correct) to write up to `degree` words to either side of the split, possibly - // from different threads. - unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) { - debug_assert!(degree > 0); - debug_assert!(degree <= MAX_SIMD_DEGREE); - debug_assert_eq!(degree.count_ones(), 1, "power of 2"); - let ptr = self.as_mut_ptr(); - let left = TransposedSplit { - ptr, - phantom_data: PhantomData, - }; - let right = TransposedSplit { - ptr: ptr.wrapping_add(degree), - phantom_data: PhantomData, - }; - (left, right) - } -} - -pub struct TransposedSplit<'vectors> { - ptr: *mut u32, - phantom_data: PhantomData<&'vectors mut u32>, -} - -unsafe impl<'vectors> Send for TransposedSplit<'vectors> {} -unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {} - -unsafe fn read_transposed_cv(src: *const u32) -> CVWords { - let mut cv = [0u32; 8]; - for word_index in 0..8 { - let offset_words = word_index * TRANSPOSED_STRIDE; - cv[word_index] = src.add(offset_words).read(); - } - cv -} - -unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) { - for word_index in 0..8 { - let offset_words = word_index * TRANSPOSED_STRIDE; - dest.add(offset_words).write(cv[word_index]); - } -} - -#[inline(always)] -pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes { - let mut bytes = [0u8; 32]; - // This loop is super verbose because currently that's what it takes to be const. 
- let mut word_index = 0; - while word_index < bytes.len() / WORD_LEN { - let word_bytes = words[word_index].to_le_bytes(); - let mut byte_index = 0; - while byte_index < WORD_LEN { - bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; - byte_index += 1; - } - word_index += 1; - } - bytes -} - -#[inline(always)] -pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes { - let mut bytes = [0u8; 64]; - // This loop is super verbose because currently that's what it takes to be const. - let mut word_index = 0; - while word_index < bytes.len() / WORD_LEN { - let word_bytes = words[word_index].to_le_bytes(); - let mut byte_index = 0; - while byte_index < WORD_LEN { - bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; - byte_index += 1; - } - word_index += 1; - } - bytes -} - -#[inline(always)] -pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords { - let mut words = [0u32; 8]; - // This loop is super verbose because currently that's what it takes to be const. - let mut word_index = 0; - while word_index < words.len() { - let mut word_bytes = [0u8; WORD_LEN]; - let mut byte_index = 0; - while byte_index < WORD_LEN { - word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; - byte_index += 1; - } - words[word_index] = u32::from_le_bytes(word_bytes); - word_index += 1; - } - words -} - -#[inline(always)] -pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords { - let mut words = [0u32; 16]; - // This loop is super verbose because currently that's what it takes to be const. 
- let mut word_index = 0; - while word_index < words.len() { - let mut word_bytes = [0u8; WORD_LEN]; - let mut byte_index = 0; - while byte_index < WORD_LEN { - word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; - byte_index += 1; - } - words[word_index] = u32::from_le_bytes(word_bytes); - word_index += 1; - } - words -} - -#[test] -fn test_byte_word_round_trips() { - let cv = *b"This is 32 LE bytes/eight words."; - assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv))); - let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words."; - assert_eq!( - block, - le_bytes_from_words_64(&words_from_le_bytes_64(&block)), - ); -} - -// The largest power of two less than or equal to `n`, used for left_len() -// immediately below, and also directly in Hasher::update(). -pub fn largest_power_of_two_leq(n: usize) -> usize { - ((n / 2) + 1).next_power_of_two() -} - -#[test] -fn test_largest_power_of_two_leq() { - let input_output = &[ - // The zero case is nonsensical, but it does work. - (0, 1), - (1, 1), - (2, 2), - (3, 2), - (4, 4), - (5, 4), - (6, 4), - (7, 4), - (8, 8), - // the largest possible usize - (usize::MAX, (usize::MAX >> 1) + 1), - ]; - for &(input, output) in input_output { - assert_eq!( - output, - crate::largest_power_of_two_leq(input), - "wrong output for n={}", - input - ); - } -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -pub fn left_len(content_len: usize) -> usize { - debug_assert!(content_len > CHUNK_LEN); - // Subtract 1 to reserve at least one byte for the right side. 
- let full_chunks = (content_len - 1) / CHUNK_LEN; - largest_power_of_two_leq(full_chunks) * CHUNK_LEN -} - -#[test] -fn test_left_len() { - let input_output = &[ - (CHUNK_LEN + 1, CHUNK_LEN), - (2 * CHUNK_LEN - 1, CHUNK_LEN), - (2 * CHUNK_LEN, CHUNK_LEN), - (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN, 2 * CHUNK_LEN), - (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), - ]; - for &(input, output) in input_output { - assert_eq!(left_len(input), output); - } -} diff --git a/third-party/blake3/rust/guts/src/portable.rs b/third-party/blake3/rust/guts/src/portable.rs deleted file mode 100644 index d597644002..0000000000 --- a/third-party/blake3/rust/guts/src/portable.rs +++ /dev/null @@ -1,262 +0,0 @@ -use crate::{ - le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64, - BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE, -}; - -const DEGREE: usize = MAX_SIMD_DEGREE; - -unsafe extern "C" fn degree() -> usize { - DEGREE -} - -#[inline(always)] -fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { - state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); - state[d] = (state[d] ^ state[a]).rotate_right(16); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(12); - state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); - state[d] = (state[d] ^ state[a]).rotate_right(8); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(7); -} - -#[inline(always)] -fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) { - // Select the message schedule based on the round. - let schedule = MSG_SCHEDULE[round]; - - // Mix the columns. 
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the diagonals. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -#[inline(always)] -fn compress_inner( - block_words: &BlockWords, - block_len: u32, - cv_words: &CVWords, - counter: u64, - flags: u32, -) -> [u32; 16] { - let mut state = [ - cv_words[0], - cv_words[1], - cv_words[2], - cv_words[3], - cv_words[4], - cv_words[5], - cv_words[6], - cv_words[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter as u32, - (counter >> 32) as u32, - block_len as u32, - flags as u32, - ]; - for round_number in 0..7 { - round(&mut state, &block_words, round_number); - } - state -} - -pub(crate) unsafe extern "C" fn compress( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut CVBytes, -) { - let block_words = words_from_le_bytes_64(&*block); - let cv_words = words_from_le_bytes_32(&*cv); - let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); - for word_index in 0..8 { - state[word_index] ^= state[word_index + 8]; - } - *out = le_bytes_from_words_32(state[..8].try_into().unwrap()); -} - -pub(crate) unsafe extern "C" fn compress_xof( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut BlockBytes, -) { - let block_words = words_from_le_bytes_64(&*block); - let cv_words = words_from_le_bytes_32(&*cv); - let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); - for word_index in 0..8 { - state[word_index] ^= state[word_index + 8]; - state[word_index + 8] ^= 
cv_words[word_index]; - } - *out = le_bytes_from_words_64(&state); -} - -pub(crate) unsafe extern "C" fn hash_chunks( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - flags: u32, - transposed_output: *mut u32, -) { - crate::hash_chunks_using_compress( - compress, - input, - input_len, - key, - counter, - flags, - transposed_output, - ) -} - -pub(crate) unsafe extern "C" fn hash_parents( - transposed_input: *const u32, - num_parents: usize, - key: *const CVBytes, - flags: u32, - transposed_output: *mut u32, // may overlap the input -) { - crate::hash_parents_using_compress( - compress, - transposed_input, - num_parents, - key, - flags, - transposed_output, - ) -} - -pub(crate) unsafe extern "C" fn xof( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - crate::xof_using_compress_xof( - compress_xof, - block, - block_len, - cv, - counter, - flags, - out, - out_len, - ) -} - -pub(crate) unsafe extern "C" fn xof_xor( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - crate::xof_xor_using_compress_xof( - compress_xof, - block, - block_len, - cv, - counter, - flags, - out, - out_len, - ) -} - -pub(crate) unsafe extern "C" fn universal_hash( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - out: *mut [u8; 16], -) { - crate::universal_hash_using_compress(compress, input, input_len, key, counter, out) -} - -pub fn implementation() -> Implementation { - Implementation::new( - degree, - compress, - hash_chunks, - hash_parents, - xof, - xof_xor, - universal_hash, - ) -} - -#[cfg(test)] -mod test { - use super::*; - - // This is circular but do it anyway. 
- #[test] - fn test_compress_vs_portable() { - crate::test::test_compress_vs_portable(&implementation()); - } - - #[test] - fn test_compress_vs_reference() { - crate::test::test_compress_vs_reference(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_hash_chunks_vs_portable() { - crate::test::test_hash_chunks_vs_portable(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_hash_parents_vs_portable() { - crate::test::test_hash_parents_vs_portable(&implementation()); - } - - #[test] - fn test_chunks_and_parents_vs_reference() { - crate::test::test_chunks_and_parents_vs_reference(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_xof_vs_portable() { - crate::test::test_xof_vs_portable(&implementation()); - } - - #[test] - fn test_xof_vs_reference() { - crate::test::test_xof_vs_reference(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_universal_hash_vs_portable() { - crate::test::test_universal_hash_vs_portable(&implementation()); - } - - #[test] - fn test_universal_hash_vs_reference() { - crate::test::test_universal_hash_vs_reference(&implementation()); - } -} diff --git a/third-party/blake3/rust/guts/src/test.rs b/third-party/blake3/rust/guts/src/test.rs deleted file mode 100644 index 83bd790cab..0000000000 --- a/third-party/blake3/rust/guts/src/test.rs +++ /dev/null @@ -1,523 +0,0 @@ -use crate::*; - -pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; - -// Test a few different initial counter values. -// - 0: The base case. -// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when -// you're supposed to ANDNOT. -// - u32::MAX: The low word of the counter overflows for all inputs except the first. -// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word. 
-const INITIAL_COUNTERS: [u64; 4] = [ - 0, - i32::MAX as u64, - u32::MAX as u64, - (42u64 << 32) + u32::MAX as u64, -]; - -const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64]; - -pub fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -pub fn test_compress_vs_portable(test_impl: &Implementation) { - for block_len in BLOCK_LENGTHS { - dbg!(block_len); - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len]); - for counter in INITIAL_COUNTERS { - dbg!(counter); - let portable_cv = portable::implementation().compress( - &block, - block_len as u32, - &TEST_KEY, - counter, - KEYED_HASH, - ); - - let test_cv = - test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH); - - assert_eq!(portable_cv, test_cv); - } - } -} - -pub fn test_compress_vs_reference(test_impl: &Implementation) { - for block_len in BLOCK_LENGTHS { - dbg!(block_len); - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len]); - - let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - ref_hasher.update(&block[..block_len]); - let mut ref_hash = [0u8; 32]; - ref_hasher.finalize(&mut ref_hash); - - let test_cv = test_impl.compress( - &block, - block_len as u32, - &TEST_KEY, - 0, - CHUNK_START | CHUNK_END | ROOT | KEYED_HASH, - ); - - assert_eq!(ref_hash, test_cv); - } -} - -fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) { - if output_a == output_b { - return; - } - for cv_index in 0..2 * MAX_SIMD_DEGREE { - let cv_a = output_a.extract_cv(cv_index); - let cv_b = output_b.extract_cv(cv_index); - if cv_a == [0; 32] && cv_b == [0; 32] { - println!("CV {cv_index:2} empty"); - } else if cv_a == cv_b { - println!("CV {cv_index:2} matches"); - } else { - println!("CV {cv_index:2} mismatch:"); - println!(" {}", hex::encode(cv_a)); - println!(" {}", hex::encode(cv_b)); - } - } - panic!("transposed outputs are not equal"); -} - -pub fn 
test_hash_chunks_vs_portable(test_impl: &Implementation) { - assert!(test_impl.degree() <= MAX_SIMD_DEGREE); - dbg!(test_impl.degree() * CHUNK_LEN); - // Allocate 4 extra bytes of padding so we can make aligned slices. - let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4]; - let mut input_slice = &mut input_buf[..]; - // Make sure the start of the input is word-aligned. - while input_slice.as_ptr() as usize % 4 != 0 { - input_slice = &mut input_slice[1..]; - } - let (aligned_input, mut unaligned_input) = - input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN); - unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN]; - assert_eq!(aligned_input.as_ptr() as usize % 4, 0); - assert_eq!(unaligned_input.as_ptr() as usize % 4, 1); - paint_test_input(aligned_input); - paint_test_input(unaligned_input); - // Try just below, equal to, and just above every whole number of chunks. - let mut input_2_lengths = Vec::new(); - let mut next_len = 2 * CHUNK_LEN; - loop { - // 95 is one whole block plus one interesting part of another - input_2_lengths.push(next_len - 95); - input_2_lengths.push(next_len); - if next_len == test_impl.degree() * CHUNK_LEN { - break; - } - input_2_lengths.push(next_len + 95); - next_len += CHUNK_LEN; - } - for input_2_len in input_2_lengths { - dbg!(input_2_len); - let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN]; - let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; - let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN]; - let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; - for initial_counter in INITIAL_COUNTERS { - dbg!(initial_counter); - // Make two calls, to test the output_column parameter. 
- let mut portable_output = TransposedVectors::new(); - let (portable_left, portable_right) = - test_impl.split_transposed_vectors(&mut portable_output); - portable::implementation().hash_chunks( - aligned_input1, - &IV_BYTES, - initial_counter, - 0, - portable_left, - ); - portable::implementation().hash_chunks( - aligned_input2, - &TEST_KEY, - initial_counter + test_impl.degree() as u64, - KEYED_HASH, - portable_right, - ); - - let mut test_output = TransposedVectors::new(); - let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); - test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left); - test_impl.hash_chunks( - aligned_input2, - &TEST_KEY, - initial_counter + test_impl.degree() as u64, - KEYED_HASH, - test_right, - ); - check_transposed_eq(&portable_output, &test_output); - - // Do the same thing with unaligned input. - let mut unaligned_test_output = TransposedVectors::new(); - let (unaligned_left, unaligned_right) = - test_impl.split_transposed_vectors(&mut unaligned_test_output); - test_impl.hash_chunks( - unaligned_input1, - &IV_BYTES, - initial_counter, - 0, - unaligned_left, - ); - test_impl.hash_chunks( - unaligned_input2, - &TEST_KEY, - initial_counter + test_impl.degree() as u64, - KEYED_HASH, - unaligned_right, - ); - check_transposed_eq(&portable_output, &unaligned_test_output); - } - } -} - -fn painted_transposed_input() -> TransposedVectors { - let mut vectors = TransposedVectors::new(); - let mut val = 0; - for col in 0..2 * MAX_SIMD_DEGREE { - for row in 0..8 { - vectors.0[row][col] = val; - val += 1; - } - } - vectors -} - -pub fn test_hash_parents_vs_portable(test_impl: &Implementation) { - assert!(test_impl.degree() <= MAX_SIMD_DEGREE); - let input = painted_transposed_input(); - for num_parents in 2..=(test_impl.degree() / 2) { - dbg!(num_parents); - let mut portable_output = TransposedVectors::new(); - let (portable_left, portable_right) = - test_impl.split_transposed_vectors(&mut 
portable_output); - portable::implementation().hash_parents( - &input, - 2 * num_parents, // num_cvs - &IV_BYTES, - 0, - portable_left, - ); - portable::implementation().hash_parents( - &input, - 2 * num_parents, // num_cvs - &TEST_KEY, - KEYED_HASH, - portable_right, - ); - - let mut test_output = TransposedVectors::new(); - let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); - test_impl.hash_parents( - &input, - 2 * num_parents, // num_cvs - &IV_BYTES, - 0, - test_left, - ); - test_impl.hash_parents( - &input, - 2 * num_parents, // num_cvs - &TEST_KEY, - KEYED_HASH, - test_right, - ); - - check_transposed_eq(&portable_output, &test_output); - } -} - -fn hash_with_chunks_and_parents_recurse( - test_impl: &Implementation, - input: &[u8], - counter: u64, - output: TransposedSplit, -) -> usize { - assert!(input.len() > 0); - if input.len() <= test_impl.degree() * CHUNK_LEN { - return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output); - } - let (left_input, right_input) = input.split_at(left_len(input.len())); - let mut child_output = TransposedVectors::new(); - let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output); - let mut children = - hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output); - assert_eq!(children, test_impl.degree()); - children += hash_with_chunks_and_parents_recurse( - test_impl, - right_input, - counter + (left_input.len() / CHUNK_LEN) as u64, - right_output, - ); - test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output) -} - -// Note: This test implementation doesn't support the 1-chunk-or-less case. -fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes { - // TODO: handle the 1-chunk case? - assert!(input.len() > CHUNK_LEN); - let mut cvs = TransposedVectors::new(); - // The right half of these vectors are never used. 
- let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs); - let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left); - while num_cvs > 2 { - num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0); - } - test_impl.compress( - &cvs.extract_parent_node(0), - BLOCK_LEN as u32, - &IV_BYTES, - 0, - PARENT | ROOT, - ) -} - -pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) { - assert_eq!(test_impl.degree().count_ones(), 1, "power of 2"); - const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN; - let mut input_buf = [0u8; MAX_INPUT_LEN]; - paint_test_input(&mut input_buf); - // Try just below, equal to, and just above every whole number of chunks, except that - // root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case. - let mut test_lengths = vec![CHUNK_LEN + 1]; - let mut next_len = 2 * CHUNK_LEN; - loop { - // 95 is one whole block plus one interesting part of another - test_lengths.push(next_len - 95); - test_lengths.push(next_len); - if next_len == MAX_INPUT_LEN { - break; - } - test_lengths.push(next_len + 95); - next_len += CHUNK_LEN; - } - for test_len in test_lengths { - dbg!(test_len); - let input = &input_buf[..test_len]; - - let mut ref_hasher = reference_impl::Hasher::new(); - ref_hasher.update(&input); - let mut ref_hash = [0u8; 32]; - ref_hasher.finalize(&mut ref_hash); - - let test_hash = root_hash_with_chunks_and_parents(test_impl, input); - - assert_eq!(ref_hash, test_hash); - } -} - -pub fn test_xof_vs_portable(test_impl: &Implementation) { - let flags = CHUNK_START | CHUNK_END | KEYED_HASH; - for counter in INITIAL_COUNTERS { - dbg!(counter); - for input_len in [0, 1, BLOCK_LEN] { - dbg!(input_len); - let mut input_block = [0u8; BLOCK_LEN]; - for byte_index in 0..input_len { - input_block[byte_index] = byte_index as u8 + 42; - } - // Try equal to and partway through every whole number of output blocks. 
- const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut output_lengths = Vec::new(); - let mut next_len = 0; - loop { - output_lengths.push(next_len); - if next_len == MAX_OUTPUT_LEN { - break; - } - output_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - for output_len in output_lengths { - dbg!(output_len); - let mut portable_output = [0xff; MAX_OUTPUT_LEN]; - portable::implementation().xof( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut portable_output[..output_len], - ); - let mut test_output = [0xff; MAX_OUTPUT_LEN]; - test_impl.xof( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut test_output[..output_len], - ); - assert_eq!(portable_output, test_output); - - // Double check that the implementation didn't overwrite. - assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); - - // The first XOR cancels out the output. - test_impl.xof_xor( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut test_output[..output_len], - ); - assert!(test_output[..output_len].iter().all(|&b| b == 0)); - assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); - - // The second XOR restores out the output. - test_impl.xof_xor( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut test_output[..output_len], - ); - assert_eq!(portable_output, test_output); - assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); - } - } - } -} - -pub fn test_xof_vs_reference(test_impl: &Implementation) { - let input = b"hello world"; - let mut input_block = [0; BLOCK_LEN]; - input_block[..input.len()].copy_from_slice(input); - - const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut ref_output = [0; MAX_OUTPUT_LEN]; - let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - ref_hasher.update(input); - ref_hasher.finalize(&mut ref_output); - - // Try equal to and partway through every whole number of output blocks. 
- let mut output_lengths = vec![0, 1, 31]; - let mut next_len = BLOCK_LEN; - loop { - output_lengths.push(next_len); - if next_len == MAX_OUTPUT_LEN { - break; - } - output_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - - for output_len in output_lengths { - dbg!(output_len); - let mut test_output = [0; MAX_OUTPUT_LEN]; - test_impl.xof( - &input_block, - input.len() as u32, - &TEST_KEY, - 0, - KEYED_HASH | CHUNK_START | CHUNK_END, - &mut test_output[..output_len], - ); - assert_eq!(ref_output[..output_len], test_output[..output_len]); - - // Double check that the implementation didn't overwrite. - assert!(test_output[output_len..].iter().all(|&b| b == 0)); - - // Do it again starting from block 1. - if output_len >= BLOCK_LEN { - test_impl.xof( - &input_block, - input.len() as u32, - &TEST_KEY, - 1, - KEYED_HASH | CHUNK_START | CHUNK_END, - &mut test_output[..output_len - BLOCK_LEN], - ); - assert_eq!( - ref_output[BLOCK_LEN..output_len], - test_output[..output_len - BLOCK_LEN], - ); - } - } -} - -pub fn test_universal_hash_vs_portable(test_impl: &Implementation) { - const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut input_buf = [0; MAX_INPUT_LEN]; - paint_test_input(&mut input_buf); - // Try equal to and partway through every whole number of input blocks. 
- let mut input_lengths = vec![0, 1, 31]; - let mut next_len = BLOCK_LEN; - loop { - input_lengths.push(next_len); - if next_len == MAX_INPUT_LEN { - break; - } - input_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - for input_len in input_lengths { - dbg!(input_len); - for counter in INITIAL_COUNTERS { - dbg!(counter); - let portable_output = portable::implementation().universal_hash( - &input_buf[..input_len], - &TEST_KEY, - counter, - ); - let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter); - assert_eq!(portable_output, test_output); - } - } -} - -fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] { - // The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended - // output to seek to a block. - const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE; - assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS); - let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS]; - let mut result = [0u8; UNIVERSAL_HASH_LEN]; - let mut block_start = 0; - while block_start < input.len() { - let block_len = cmp::min(input.len() - block_start, BLOCK_LEN); - let mut ref_hasher = reference_impl::Hasher::new_keyed(key); - ref_hasher.update(&input[block_start..block_start + block_len]); - ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]); - for byte_index in 0..UNIVERSAL_HASH_LEN { - result[byte_index] ^= output_buffer[block_start + byte_index]; - } - block_start += BLOCK_LEN; - } - result -} - -pub fn test_universal_hash_vs_reference(test_impl: &Implementation) { - const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut input_buf = [0; MAX_INPUT_LEN]; - paint_test_input(&mut input_buf); - // Try equal to and partway through every whole number of input blocks. 
- let mut input_lengths = vec![0, 1, 31]; - let mut next_len = BLOCK_LEN; - loop { - input_lengths.push(next_len); - if next_len == MAX_INPUT_LEN { - break; - } - input_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - for input_len in input_lengths { - dbg!(input_len); - let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY); - let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0); - assert_eq!(ref_output, test_output); - } -} diff --git a/third-party/blake3/src/ffi_avx512.rs b/third-party/blake3/src/ffi_avx512.rs index 884f48135d..afa0221bd4 100644 --- a/third-party/blake3/src/ffi_avx512.rs +++ b/third-party/blake3/src/ffi_avx512.rs @@ -60,6 +60,28 @@ pub unsafe fn hash_many( ) } +// Unsafe because this may only be called on platforms supporting AVX-512. +#[cfg(unix)] +pub unsafe fn xof_many( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + out: &mut [u8], +) { + debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); + ffi::blake3_xof_many_avx512( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + out.len() / BLOCK_LEN, + ); +} + pub mod ffi { extern "C" { pub fn blake3_compress_in_place_avx512( @@ -89,6 +111,16 @@ pub mod ffi { flags_end: u8, out: *mut u8, ); + #[cfg(unix)] + pub fn blake3_xof_many_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + outblocks: usize, + ); } } @@ -111,4 +143,13 @@ mod test { } crate::test::test_hash_many_fn(hash_many, hash_many); } + + #[cfg(unix)] + #[test] + fn test_xof_many() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_xof_many_fn(xof_many); + } } diff --git a/third-party/blake3/src/join.rs b/third-party/blake3/src/join.rs index 227216a39b..862ebcf9a5 100644 --- a/third-party/blake3/src/join.rs +++ b/third-party/blake3/src/join.rs @@ -67,7 +67,7 @@ impl Join for RayonJoin { RA: Send, 
RB: Send, { - rayon::join(oper_a, oper_b) + rayon_core::join(oper_a, oper_b) } } diff --git a/third-party/blake3/src/lib.rs b/third-party/blake3/src/lib.rs index d661cb2db2..37c2c0b318 100644 --- a/third-party/blake3/src/lib.rs +++ b/third-party/blake3/src/lib.rs @@ -138,6 +138,8 @@ use arrayvec::{ArrayString, ArrayVec}; use core::cmp; use core::fmt; use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; +#[cfg(feature = "zeroize")] +use zeroize::Zeroize; /// The number of bytes in a [`Hash`](struct.Hash.html), 32. pub const OUT_LEN: usize = 32; @@ -216,7 +218,6 @@ fn counter_high(counter: u64) -> u32 { /// [`from_hex`]: #method.from_hex /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html -#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash)] pub struct Hash([u8; OUT_LEN]); @@ -305,28 +306,19 @@ impl core::str::FromStr for Hash { } } -// A proper implementation of constant time equality is tricky, and we get it from the -// constant_time_eq crate instead of rolling our own. However, that crate isn't compatible with -// Miri, so we roll our own just for that. -#[cfg(miri)] -fn constant_time_eq_miri(a: &[u8], b: &[u8]) -> bool { - if a.len() != b.len() { - return false; +#[cfg(feature = "zeroize")] +impl Zeroize for Hash { + fn zeroize(&mut self) { + // Destructuring to trigger compile error as a reminder to update this impl. + let Self(bytes) = self; + bytes.zeroize(); } - let mut x = 0; - for i in 0..a.len() { - x |= a[i] ^ b[i]; - } - x == 0 } /// This implementation is constant-time. 
impl PartialEq for Hash { #[inline] fn eq(&self, other: &Hash) -> bool { - #[cfg(miri)] - return constant_time_eq_miri(&self.0, &other.0); - #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, &other.0) } } @@ -335,9 +327,6 @@ impl PartialEq for Hash { impl PartialEq<[u8; OUT_LEN]> for Hash { #[inline] fn eq(&self, other: &[u8; OUT_LEN]) -> bool { - #[cfg(miri)] - return constant_time_eq_miri(&self.0, other); - #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, other) } } @@ -346,9 +335,6 @@ impl PartialEq<[u8; OUT_LEN]> for Hash { impl PartialEq<[u8]> for Hash { #[inline] fn eq(&self, other: &[u8]) -> bool { - #[cfg(miri)] - return constant_time_eq_miri(&self.0, other); - #[cfg(not(miri))] constant_time_eq::constant_time_eq(&self.0, other) } } @@ -416,7 +402,6 @@ impl std::error::Error for HexError {} // Each chunk or parent node can produce either a 32-byte chaining value or, by // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. -#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] struct Output { input_chaining_value: CVWords, @@ -424,7 +409,6 @@ struct Output { block_len: u8, counter: u64, flags: u8, - #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -460,8 +444,28 @@ impl Output { } } +#[cfg(feature = "zeroize")] +impl Zeroize for Output { + fn zeroize(&mut self) { + // Destructuring to trigger compile error as a reminder to update this impl. 
+ let Self { + input_chaining_value, + block, + block_len, + counter, + flags, + platform: _, + } = self; + + input_chaining_value.zeroize(); + block.zeroize(); + block_len.zeroize(); + counter.zeroize(); + flags.zeroize(); + } +} + #[derive(Clone)] -#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] struct ChunkState { cv: CVWords, chunk_counter: u64, @@ -469,7 +473,6 @@ struct ChunkState { buf_len: u8, blocks_compressed: u8, flags: u8, - #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -572,6 +575,29 @@ impl fmt::Debug for ChunkState { } } +#[cfg(feature = "zeroize")] +impl Zeroize for ChunkState { + fn zeroize(&mut self) { + // Destructuring to trigger compile error as a reminder to update this impl. + let Self { + cv, + chunk_counter, + buf, + buf_len, + blocks_compressed, + flags, + platform: _, + } = self; + + cv.zeroize(); + chunk_counter.zeroize(); + buf.zeroize(); + buf_len.zeroize(); + blocks_compressed.zeroize(); + flags.zeroize(); + } +} + // IMPLEMENTATION NOTE // =================== // The recursive function compress_subtree_wide(), implemented below, is the @@ -854,8 +880,17 @@ fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Ou /// The default hash function. /// -/// For an incremental version that accepts multiple writes, see -/// [`Hasher::update`]. +/// For an incremental version that accepts multiple writes, see [`Hasher::new`], +/// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent: +/// +/// ``` +/// let hash = blake3::hash(b"foo"); +/// # let hash1 = hash; +/// +/// let hash = blake3::Hasher::new().update(b"foo").finalize(); +/// # let hash2 = hash; +/// # assert_eq!(hash1, hash2); +/// ``` /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and /// [`OutputReader`]. @@ -874,11 +909,22 @@ pub fn hash(input: &[u8]) -> Hash { /// requirement, and callers need to be careful not to compare MACs as raw /// bytes. 
/// -/// For output sizes other than 32 bytes, see [`Hasher::new_keyed`], -/// [`Hasher::finalize_xof`], and [`OutputReader`]. +/// For an incremental version that accepts multiple writes, see [`Hasher::new_keyed`], +/// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent: +/// +/// ``` +/// # const KEY: &[u8; 32] = &[0; 32]; +/// let mac = blake3::keyed_hash(KEY, b"foo"); +/// # let mac1 = mac; +/// +/// let mac = blake3::Hasher::new_keyed(KEY).update(b"foo").finalize(); +/// # let mac2 = mac; +/// # assert_eq!(mac1, mac2); +/// ``` +/// +/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see -/// [`Hasher::new_keyed`] and /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { let key_words = platform::words_from_le_bytes_32(key); @@ -912,11 +958,25 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { /// [Argon2]. Password hashes are entirely different from generic hash /// functions, with opposite design requirements. /// -/// For output sizes other than 32 bytes, see [`Hasher::new_derive_key`], -/// [`Hasher::finalize_xof`], and [`OutputReader`]. +/// For an incremental version that accepts multiple writes, see [`Hasher::new_derive_key`], +/// [`Hasher::update`], and [`Hasher::finalize`]. These two statements are equivalent: +/// +/// ``` +/// # const CONTEXT: &str = "example.com 2019-12-25 16:18:03 session tokens v1"; +/// let key = blake3::derive_key(CONTEXT, b"key material, not a password"); +/// # let key1 = key; +/// +/// let key: [u8; 32] = blake3::Hasher::new_derive_key(CONTEXT) +/// .update(b"key material, not a password") +/// .finalize() +/// .into(); +/// # let key2 = key; +/// # assert_eq!(key1, key2); +/// ``` +/// +/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`]. 
/// /// This function is always single-threaded. For multithreading support, see -/// [`Hasher::new_derive_key`] and /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). /// /// [Argon2]: https://en.wikipedia.org/wiki/Argon2 @@ -985,7 +1045,6 @@ fn parent_node_output( /// # } /// ``` #[derive(Clone)] -#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] pub struct Hasher { key: CVWords, chunk_state: ChunkState, @@ -1090,7 +1149,7 @@ impl Hasher { // the root node of the whole tree, and it would need to be ROOT // finalized. We can't compress it until we know. // 2) This 64 KiB input might complete a larger tree, whose root node is - // similarly going to be the the root of the whole tree. For example, + // similarly going to be the root of the whole tree. For example, // maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't // compress the node at the root of the 256 KiB subtree until we know // how to finalize it. @@ -1532,6 +1591,22 @@ impl std::io::Write for Hasher { } } +#[cfg(feature = "zeroize")] +impl Zeroize for Hasher { + fn zeroize(&mut self) { + // Destructuring to trigger compile error as a reminder to update this impl. + let Self { + key, + chunk_state, + cv_stack, + } = self; + + key.zeroize(); + chunk_state.zeroize(); + cv_stack.zeroize(); + } +} + /// An incremental reader for extended output, returned by /// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). /// @@ -1555,7 +1630,6 @@ impl std::io::Write for Hasher { /// from an unknown position in the output stream to recover its block index. Callers with strong /// secret keys aren't affected in practice, but secret offsets are a [design /// smell](https://en.wikipedia.org/wiki/Design_smell) in any case. 
-#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] pub struct OutputReader { inner: Output, @@ -1570,6 +1644,23 @@ impl OutputReader { } } + // This helper function handles both the case where the output buffer is + // shorter than one block, and the case where our position_within_block is + // non-zero. + fn fill_one_block(&mut self, buf: &mut &mut [u8]) { + let output_block: [u8; BLOCK_LEN] = self.inner.root_output_block(); + let output_bytes = &output_block[self.position_within_block as usize..]; + let take = cmp::min(buf.len(), output_bytes.len()); + buf[..take].copy_from_slice(&output_bytes[..take]); + self.position_within_block += take as u8; + if self.position_within_block == BLOCK_LEN as u8 { + self.inner.counter += 1; + self.position_within_block = 0; + } + // Advance the dest buffer. mem::take() is a borrowck workaround. + *buf = &mut core::mem::take(buf)[take..]; + } + /// Fill a buffer with output bytes and advance the position of the /// `OutputReader`. This is equivalent to [`Read::read`], except that it /// doesn't return a `Result`. Both methods always fill the entire buffer. @@ -1586,17 +1677,35 @@ impl OutputReader { /// /// [`Read::read`]: #method.read pub fn fill(&mut self, mut buf: &mut [u8]) { - while !buf.is_empty() { - let block: [u8; BLOCK_LEN] = self.inner.root_output_block(); - let output_bytes = &block[self.position_within_block as usize..]; - let take = cmp::min(buf.len(), output_bytes.len()); - buf[..take].copy_from_slice(&output_bytes[..take]); - buf = &mut buf[take..]; - self.position_within_block += take as u8; - if self.position_within_block == BLOCK_LEN as u8 { - self.inner.counter += 1; - self.position_within_block = 0; - } + if buf.is_empty() { + return; + } + + // If we're partway through a block, try to get to a block boundary. 
+ if self.position_within_block != 0 { + self.fill_one_block(&mut buf); + } + + let full_blocks = buf.len() / BLOCK_LEN; + let full_blocks_len = full_blocks * BLOCK_LEN; + if full_blocks > 0 { + debug_assert_eq!(0, self.position_within_block); + self.inner.platform.xof_many( + &self.inner.input_chaining_value, + &self.inner.block, + self.inner.block_len, + self.inner.counter, + self.inner.flags | ROOT, + &mut buf[..full_blocks_len], + ); + self.inner.counter += full_blocks as u64; + buf = &mut buf[full_blocks * BLOCK_LEN..]; + } + + if !buf.is_empty() { + debug_assert!(buf.len() < BLOCK_LEN); + self.fill_one_block(&mut buf); + debug_assert!(buf.is_empty()); } } @@ -1667,3 +1776,17 @@ impl std::io::Seek for OutputReader { Ok(self.position()) } } + +#[cfg(feature = "zeroize")] +impl Zeroize for OutputReader { + fn zeroize(&mut self) { + // Destructuring to trigger compile error as a reminder to update this impl. + let Self { + inner, + position_within_block, + } = self; + + inner.zeroize(); + position_within_block.zeroize(); + } +} diff --git a/third-party/blake3/src/platform.rs b/third-party/blake3/src/platform.rs index 79bc9a3fb8..cd8ef63d2b 100644 --- a/third-party/blake3/src/platform.rs +++ b/third-party/blake3/src/platform.rs @@ -277,6 +277,41 @@ impl Platform { } } + pub fn xof_many( + &self, + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + mut counter: u64, + flags: u8, + out: &mut [u8], + ) { + debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); + if out.is_empty() { + // The current assembly implementation always outputs at least 1 block. + return; + } + match self { + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(unix)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::xof_many(cv, block, block_len, counter, flags, out) + }, + _ => { + // For platforms without an optimized xof_many, fall back to a loop over + // compress_xof. 
This is still faster than portable code. + for out_block in out.chunks_exact_mut(BLOCK_LEN) { + // TODO: Use array_chunks_mut here once that's stable. + let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap(); + *out_array = self.compress_xof(cv, block, block_len, counter, flags); + counter += 1; + } + } + } + } + // Explicit platform constructors, for benchmarks. pub fn portable() -> Self { diff --git a/third-party/blake3/src/test.rs b/third-party/blake3/src/test.rs index c76cbbc03a..a7ac4fcf6a 100644 --- a/third-party/blake3/src/test.rs +++ b/third-party/blake3/src/test.rs @@ -206,6 +206,76 @@ pub fn test_hash_many_fn( } } +#[allow(unused)] +type XofManyFunction = unsafe fn( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + out: &mut [u8], +); + +// A shared helper function for platform-specific tests. +#[allow(unused)] +pub fn test_xof_many_fn(xof_many_function: XofManyFunction) { + let mut block = [0; BLOCK_LEN]; + let block_len = 42; + crate::test::paint_test_input(&mut block[..block_len]); + let cv = [40, 41, 42, 43, 44, 45, 46, 47]; + let flags = crate::KEYED_HASH; + + // Test a few different initial counter values. + // - 0: The base case. + // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... 
+ let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + #[cfg(feature = "std")] + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) outputs + const OUTPUT_SIZE: usize = 31 * BLOCK_LEN; + + let mut portable_out = [0u8; OUTPUT_SIZE]; + for (i, out_block) in portable_out.chunks_exact_mut(64).enumerate() { + out_block.copy_from_slice(&crate::portable::compress_xof( + &cv, + &block, + block_len as u8, + counter + i as u64, + flags, + )); + } + + let mut test_out = [0u8; OUTPUT_SIZE]; + unsafe { + xof_many_function(&cv, &block, block_len as u8, counter, flags, &mut test_out); + } + + assert_eq!(portable_out, test_out); + } + + // Test that xof_many doesn't write more blocks than requested. Note that the current assembly + // implementation always outputs at least one block, so we don't test the zero case. + for block_count in 1..=32 { + let mut array = [0; BLOCK_LEN * 33]; + let output_start = 17; + let output_len = block_count * BLOCK_LEN; + let output_end = output_start + output_len; + let output = &mut array[output_start..output_end]; + unsafe { + xof_many_function(&cv, &block, block_len as u8, 0, flags, output); + } + for i in 0..array.len() { + if i < output_start || output_end <= i { + assert_eq!(0, array[i], "index {i}"); + } + } + } +} + #[test] fn test_key_bytes_equal_key_words() { assert_eq!( @@ -373,6 +443,43 @@ fn test_compare_reference_impl() { } } +#[test] +fn test_compare_reference_impl_long_xof() { + let mut reference_output = [0u8; 32 * BLOCK_LEN - 1]; + let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + reference_hasher.update(b"hello world"); + reference_hasher.finalize(&mut reference_output); + + let mut test_output = [0u8; 32 * BLOCK_LEN - 1]; + let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); + test_hasher.update(b"hello world"); + test_hasher.finalize_xof().fill(&mut test_output); + + assert_eq!(reference_output, test_output); +} + +#[test] +fn test_xof_partial_blocks() { + 
const OUT_LEN: usize = 6 * BLOCK_LEN; + let mut reference_out = [0u8; OUT_LEN]; + reference_impl::Hasher::new().finalize(&mut reference_out); + + let mut all_at_once_out = [0u8; OUT_LEN]; + crate::Hasher::new() + .finalize_xof() + .fill(&mut all_at_once_out); + assert_eq!(reference_out, all_at_once_out); + + let mut partial_out = [0u8; OUT_LEN]; + let partial_start = 32; + let partial_end = OUT_LEN - 32; + let mut xof = crate::Hasher::new().finalize_xof(); + xof.fill(&mut partial_out[..partial_start]); + xof.fill(&mut partial_out[partial_start..partial_end]); + xof.fill(&mut partial_out[partial_end..]); + assert_eq!(reference_out, partial_out); +} + fn reference_hash(input: &[u8]) -> crate::Hash { let mut hasher = reference_impl::Hasher::new(); hasher.update(input); @@ -449,6 +556,42 @@ fn test_fuzz_hasher() { } } +#[test] +fn test_fuzz_xof() { + let mut input_buf = [0u8; 3 * BLOCK_LEN]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 2500 }; + + // Use a fixed RNG seed for reproducibility. 
+ let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + #[cfg(feature = "std")] + dbg!(_num_test); + // 31 (16 + 8 + 4 + 2 + 1) outputs + let mut output_buf = [0; 31 * CHUNK_LEN]; + let input_len = rng.gen_range(0..input_buf.len()); + let mut xof = crate::Hasher::new() + .update(&input_buf[..input_len]) + .finalize_xof(); + let partial_start = rng.gen_range(0..output_buf.len()); + let partial_end = rng.gen_range(partial_start..output_buf.len()); + xof.fill(&mut output_buf[..partial_start]); + xof.fill(&mut output_buf[partial_start..partial_end]); + xof.fill(&mut output_buf[partial_end..]); + + let mut reference_buf = [0; 31 * CHUNK_LEN]; + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(&input_buf[..input_len]); + reference_hasher.finalize(&mut reference_buf); + + assert_eq!(reference_buf, output_buf); + } +} + #[test] fn test_xof_seek() { let mut out = [0; 533]; @@ -809,14 +952,45 @@ fn test_mmap_rayon() -> Result<(), std::io::Error> { #[cfg(feature = "std")] #[cfg(feature = "serde")] fn test_serde() { - let hash: crate::Hash = [7; 32].into(); + // Henrik suggested that we use 0xfe / 254 for byte test data instead of 0xff / 255, due to the + // fact that 0xfe is not a well formed CBOR item. 
+ let hash: crate::Hash = [0xfe; 32].into(); + let json = serde_json::to_string(&hash).unwrap(); assert_eq!( json, - "[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]", + "[254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]", ); let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); assert_eq!(hash, hash2); + + let mut cbor = Vec::::new(); + ciborium::into_writer(&hash, &mut cbor).unwrap(); + assert_eq!( + cbor, + [ + 0x98, 0x20, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, + ] + ); + let hash_from_cbor: crate::Hash = ciborium::from_reader(&cbor[..]).unwrap(); + assert_eq!(hash_from_cbor, hash); + + // Version 1.5.2 of this crate changed the default serialization format to a bytestring + // (instead of an array/list) to save bytes on the wire. That was a backwards compatibility + // mistake for non-self-describing formats, and it's been reverted. Since some small number of + // serialized bytestrings will probably exist forever in the wild, we shold test that we can + // still deserialize these from self-describing formats. 
+ let bytestring_cbor: &[u8] = &[ + 0x58, 0x20, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfe, 0xfe, 0xfe, 0xfe, + ]; + let hash_from_bytestring_cbor: crate::Hash = ciborium::from_reader(bytestring_cbor).unwrap(); + assert_eq!(hash_from_bytestring_cbor, hash); } // `cargo +nightly miri test` currently works, but it takes forever, because some of our test diff --git a/third-party/blake3/tools/release.md b/third-party/blake3/tools/release.md index 924f32791d..0d05eba50d 100644 --- a/third-party/blake3/tools/release.md +++ b/third-party/blake3/tools/release.md @@ -12,5 +12,4 @@ - `git tag` the version bump commit with the new version number. - `git push --tags` - `cargo publish` in the root. -- `cargo publish --dry-run` in b3sum/ and make sure it fetches the just-published library version. - `cargo publish` in b3sum/. diff --git a/third-party/mimalloc/.gitignore b/third-party/mimalloc/.gitignore index df1d58eb2e..b2439f94b2 100644 --- a/third-party/mimalloc/.gitignore +++ b/third-party/mimalloc/.gitignore @@ -1,7 +1,6 @@ ide/vs20??/*.db ide/vs20??/*.opendb ide/vs20??/*.user -ide/vs20??/*.vcxproj.filters ide/vs20??/.vs ide/vs20??/VTune* out/ diff --git a/third-party/mimalloc/CMakeLists.txt b/third-party/mimalloc/CMakeLists.txt index bcfe91d867..5ca281e19b 100644 --- a/third-party/mimalloc/CMakeLists.txt +++ b/third-party/mimalloc/CMakeLists.txt @@ -7,18 +7,20 @@ set(CMAKE_CXX_STANDARD 17) option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF) option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode (expensive)" OFF) option(MI_PADDING "Enable padding to detect heap block overflow (always on in DEBUG or SECURE mode, or with Valgrind/ASAN)" OFF) -option(MI_OVERRIDE "Override the standard malloc interface (e.g. 
define entry points for malloc() etc)" ON) +option(MI_OVERRIDE "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON) option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF) option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF) option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF) option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) +option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" ON) option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON) -option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) +option(MI_WIN_USE_FIXED_TLS "Use a fixed TLS slot on Windows to avoid extra tests in the malloc fast path" OFF) +option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF) option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF) option(MI_BUILD_SHARED "Build shared library" ON) option(MI_BUILD_STATIC "Build static library" ON) @@ -26,12 +28,15 @@ option(MI_BUILD_OBJECT "Build object library" ON) option(MI_BUILD_TESTS "Build test executables" ON) option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF) option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF) +option(MI_GUARDED 
"Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF) option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF) option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) option(MI_NO_THP "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF) +option(MI_EXTRA_CPPDEFS "Extra pre-processor definitions (use as `-DMI_EXTRA_CPPDEFS=\"opt1=val1;opt2=val2\"`)" "") # deprecated options +option(MI_WIN_USE_FLS "Use Fiber local storage on Windows to detect thread termination (deprecated)" OFF) option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF) option(MI_USE_LIBATOMIC "Explicitly link with -latomic (on older systems) (deprecated and detected automatically)" OFF) @@ -61,37 +66,53 @@ set(mi_sources set(mi_cflags "") set(mi_cflags_static "") # extra flags for a static library build set(mi_cflags_dynamic "") # extra flags for a shared-object library build -set(mi_defines "") set(mi_libraries "") +if(MI_EXTRA_CPPDEFS) + set(mi_defines ${MI_EXTRA_CPPDEFS}) +else() + set(mi_defines "") +endif() + # ----------------------------------------------------------------------------- -# Convenience: set default build type depending on the build directory +# Convenience: set default build type and compiler depending on the build directory # ----------------------------------------------------------------------------- message(STATUS "") if (NOT CMAKE_BUILD_TYPE) - if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR MI_DEBUG_FULL) - message(STATUS "No build type selected, default to: Debug") + if ("${CMAKE_BINARY_DIR}" MATCHES ".*((D|d)ebug|asan|tsan|ubsan|valgrind)$" OR MI_DEBUG_FULL) + message(STATUS "No build type selected, default to 'Debug'") set(CMAKE_BUILD_TYPE "Debug") else() - message(STATUS "No 
build type selected, default to: Release")
+    message(STATUS "No build type selected, default to 'Release'")
     set(CMAKE_BUILD_TYPE "Release")
   endif()
 endif()
 
+if (CMAKE_GENERATOR MATCHES "^Visual Studio.*$")
+  message(STATUS "Note: when building with Visual Studio the build type is specified when building.")
+  message(STATUS "For example: 'cmake --build . --config=Release'")
+endif()
+
 if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$")
   message(STATUS "Default to secure build")
   set(MI_SECURE "ON")
 endif()
 
-
 # -----------------------------------------------------------------------------
 # Process options
 # -----------------------------------------------------------------------------
 
+if(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
+  set(MI_CLANG_CL "ON")
+endif()
 
 # put -Wall early so other warnings can be disabled selectively
 if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang")
-  list(APPEND mi_cflags -Wall -Wextra -Wpedantic)
+  if (MI_CLANG_CL)
+    list(APPEND mi_cflags -W)
+  else()
+    list(APPEND mi_cflags -Wall -Wextra -Wpedantic)
+  endif()
 endif()
 if(CMAKE_C_COMPILER_ID MATCHES "GNU")
   list(APPEND mi_cflags -Wall -Wextra)
@@ -104,6 +125,14 @@ if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel")
   set(MI_USE_CXX "ON")
 endif()
 
+if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo")
+  if (NOT MI_OPT_ARCH)
+    message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)")
+  endif()
+else()
+  set(MI_OPT_ARCH OFF)
+endif()
+
 if(MI_OVERRIDE)
   message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
   if(APPLE)
@@ -131,12 +160,6 @@ if(MI_OVERRIDE)
 endif()
 
 if(WIN32)
-  if (MI_WIN_REDIRECT)
-    if (MSVC_C_ARCHITECTURE_ID MATCHES "ARM")
-      message(STATUS "Cannot use redirection on Windows ARM (MI_WIN_REDIRECT=OFF)")
-      set(MI_WIN_REDIRECT OFF)
-    endif()
-  endif()
   if (NOT MI_WIN_REDIRECT)
     # use a negative define for backward compatibility
     list(APPEND mi_defines MI_WIN_NOREDIRECT=1)
@@ -152,8 +175,8 @@ if(MI_TRACK_VALGRIND)
CHECK_INCLUDE_FILES("valgrind/valgrind.h;valgrind/memcheck.h" MI_HAS_VALGRINDH) if (NOT MI_HAS_VALGRINDH) set(MI_TRACK_VALGRIND OFF) - message(WARNING "Cannot find the 'valgrind/valgrind.h' and 'valgrind/memcheck.h' -- install valgrind first") - message(STATUS "Compile **without** Valgrind support (MI_TRACK_VALGRIND=OFF)") + message(WARNING "Cannot find the 'valgrind/valgrind.h' and 'valgrind/memcheck.h' -- install valgrind first?") + message(STATUS "Disabling Valgrind support (MI_TRACK_VALGRIND=OFF)") else() message(STATUS "Compile with Valgrind support (MI_TRACK_VALGRIND=ON)") list(APPEND mi_defines MI_TRACK_VALGRIND=1) @@ -199,6 +222,15 @@ if(MI_TRACK_ETW) endif() endif() +if(MI_GUARDED) + message(STATUS "Compile guard pages behind certain object allocations (MI_GUARDED=ON)") + list(APPEND mi_defines MI_GUARDED=1) + if(NOT MI_NO_PADDING) + message(STATUS " Disabling padding due to guard pages (MI_NO_PADDING=ON)") + set(MI_NO_PADDING ON) + endif() +endif() + if(MI_SEE_ASM) message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") list(APPEND mi_cflags -save-temps) @@ -258,6 +290,7 @@ if(MI_DEBUG_UBSAN) if(CMAKE_BUILD_TYPE MATCHES "Debug") if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") message(STATUS "Build with undefined-behavior sanitizer (MI_DEBUG_UBSAN=ON)") + list(APPEND mi_defines MI_UBSAN=1) list(APPEND mi_cflags -fsanitize=undefined -g -fno-sanitize-recover=undefined) list(APPEND mi_libraries -fsanitize=undefined) if (NOT MI_USE_CXX) @@ -296,6 +329,48 @@ if(MI_LIBC_MUSL) list(APPEND mi_defines MI_LIBC_MUSL=1) endif() +if(MI_WIN_USE_FLS) + message(STATUS "Use the Fiber API to detect thread termination (deprecated) (MI_WIN_USE_FLS=ON)") + list(APPEND mi_defines MI_WIN_USE_FLS=1) +endif() + +if(MI_WIN_USE_FIXED_TLS) + message(STATUS "Use fixed TLS slot on Windows to avoid extra tests in the malloc fast path (MI_WIN_USE_FIXED_TLS=ON)") + list(APPEND mi_defines MI_WIN_USE_FIXED_TLS=1) +endif() + +# Determine architecture +set(MI_OPT_ARCH_FLAGS "") +set(MI_ARCH 
"unknown")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$")
+  set(MI_ARCH "x86")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64") # must be before arm64
+  set(MI_ARCH "x64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv8.?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64")
+  set(MI_ARCH "arm64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$")
+  set(MI_ARCH "arm32")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$")
+  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    set(MI_ARCH "riscv32")
+  else()
+    set(MI_ARCH "riscv64")
+  endif()
+else()
+  set(MI_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+message(STATUS "Architecture: ${MI_ARCH}") # (${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_GENERATOR_PLATFORM}, ${CMAKE_GENERATOR})")
+
+# Check /proc/cpuinfo for an SV39 MMU and limit the virtual address bits.
+# (this will skip the aligned hinting in that case.
Issue #939, #949) +if (EXISTS /proc/cpuinfo) + file(STRINGS /proc/cpuinfo mi_sv39_mmu REGEX "^mmu[ \t]+:[ \t]+sv39$") + if (mi_sv39_mmu) + MESSAGE( STATUS "Set virtual address bits to 39 (SV39 MMU detected)" ) + list(APPEND mi_defines MI_DEFAULT_VIRTUAL_ADDRESS_BITS=39) + endif() +endif() + # On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788 # if(CMAKE_SYSTEM_NAME MATCHES "Haiku") # SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) @@ -303,21 +378,21 @@ endif() # endif() # Compiler flags -if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU" AND NOT MI_CLANG_CL) list(APPEND mi_cflags -Wno-unknown-pragmas -fvisibility=hidden) if(NOT MI_USE_CXX) list(APPEND mi_cflags -Wstrict-prototypes) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") list(APPEND mi_cflags -Wno-static-in-inline) - endif() + endif() endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel") list(APPEND mi_cflags -fvisibility=hidden) endif() -if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku" AND NOT MI_CLANG_CL) if(MI_LOCAL_DYNAMIC_TLS) list(APPEND mi_cflags -ftls-model=local-dynamic) else() @@ -327,7 +402,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() - list(APPEND mi_cflags -ftls-model=initial-exec) + list(APPEND mi_cflags -ftls-model=initial-exec) endif() endif() if(MI_OVERRIDE) @@ -335,28 +410,46 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM endif() endif() +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") + if(MI_OPT_ARCH) + if(MI_ARCH STREQUAL "arm64") + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast 
atomics + endif() + endif() +endif() + if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) list(APPEND mi_cflags /Zc:__cplusplus) + if(MI_OPT_ARCH AND NOT MI_CLANG_CL) + if(MI_ARCH STREQUAL "arm64") + set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics + endif() + endif() endif() if(MINGW) - add_definitions(-D_WIN32_WINNT=0x600) + add_definitions(-D_WIN32_WINNT=0x601) # issue #976 +endif() + +if(MI_OPT_ARCH_FLAGS) + list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS}) + message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)") endif() # extra needed libraries -# we prefer -l test over `find_library` as sometimes core libraries +# we prefer -l test over `find_library` as sometimes core libraries # like `libatomic` are not on the system path (see issue #898) -function(find_link_library libname outlibname) - check_linker_flag(C "-l${libname}" mi_has_lib${libname}) +function(find_link_library libname outlibname) + check_linker_flag(C "-l${libname}" mi_has_lib${libname}) if (mi_has_lib${libname}) message(VERBOSE "link library: -l${libname}") - set(${outlibname} ${libname} PARENT_SCOPE) + set(${outlibname} ${libname} PARENT_SCOPE) else() find_library(MI_LIBPATH libname) if (MI_LIBPATH) message(VERBOSE "link library ${libname} at ${MI_LIBPATH}") - set(${outlibname} ${MI_LIBPATH} PARENT_SCOPE) + set(${outlibname} ${MI_LIBPATH} PARENT_SCOPE) else() message(VERBOSE "link library not found: ${libname}") set(${outlibname} "" PARENT_SCOPE) @@ -365,19 +458,19 @@ function(find_link_library libname outlibname) endfunction() if(WIN32) - list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt) + list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt) else() find_link_library("pthread" MI_LIB_PTHREAD) - if(MI_LIB_PTHREAD) + if(MI_LIB_PTHREAD) list(APPEND mi_libraries "${MI_LIB_PTHREAD}") endif() find_link_library("rt" MI_LIB_RT) - if(MI_LIB_RT) + if(MI_LIB_RT) list(APPEND mi_libraries "${MI_LIB_RT}") endif() 
find_link_library("atomic" MI_LIB_ATOMIC) - if(MI_LIB_ATOMIC) - list(APPEND mi_libraries "${MI_LIB_ATOMIC}") + if(MI_LIB_ATOMIC) + list(APPEND mi_libraries "${MI_LIB_ATOMIC}") endif() endif() @@ -431,7 +524,7 @@ endif() message(STATUS "") message(STATUS "Library base name: ${mi_basename}") -message(STATUS "Version : ${mi_version}") +message(STATUS "Version : ${mi_version}.${mi_version_patch}") message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}") if(MI_USE_CXX) message(STATUS "C++ Compiler : ${CMAKE_CXX_COMPILER}") @@ -461,10 +554,18 @@ if(MI_BUILD_SHARED) ) if(WIN32 AND MI_WIN_REDIRECT) # On windows, link and copy the mimalloc redirection dll too. - if(CMAKE_SIZEOF_VOID_P EQUAL 4) + if(CMAKE_GENERATOR_PLATFORM STREQUAL "arm64ec") + set(MIMALLOC_REDIRECT_SUFFIX "-arm64ec") + elseif(MI_ARCH STREQUAL "x64") + set(MIMALLOC_REDIRECT_SUFFIX "") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + message(STATUS "Note: x64 code emulated on Windows for arm64 should use an arm64ec build of 'mimalloc-override.dll'") + message(STATUS " with 'mimalloc-redirect-arm64ec.dll'. See the 'bin\\readme.md' for more information.") + endif() + elseif(MI_ARCH STREQUAL "x86") set(MIMALLOC_REDIRECT_SUFFIX "32") else() - set(MIMALLOC_REDIRECT_SUFFIX "") + set(MIMALLOC_REDIRECT_SUFFIX "-${MI_ARCH}") # -arm64 etc. 
endif() target_link_libraries(mimalloc PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.lib) diff --git a/third-party/mimalloc/LICENSE b/third-party/mimalloc/LICENSE index 670b668a0c..53315ebee5 100644 --- a/third-party/mimalloc/LICENSE +++ b/third-party/mimalloc/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen +Copyright (c) 2018-2025 Microsoft Corporation, Daan Leijen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/third-party/mimalloc/azure-pipelines.yml b/third-party/mimalloc/azure-pipelines.yml index 0247c76fd5..ae4f65ffa1 100644 --- a/third-party/mimalloc/azure-pipelines.yml +++ b/third-party/mimalloc/azure-pipelines.yml @@ -8,14 +8,15 @@ trigger: include: - master - dev - - dev-slice + - dev2 + - dev3 tags: include: - v* jobs: - job: - displayName: Windows + displayName: Windows 2022 pool: vmImage: windows-2022 @@ -43,7 +44,7 @@ jobs: solution: $(BuildType)/libmimalloc.sln configuration: '$(MSBuildConfiguration)' msbuildArguments: -m - - script: ctest --verbose --timeout 120 -C $(MSBuildConfiguration) + - script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration) workingDirectory: $(BuildType) displayName: CTest #- script: $(BuildType)\$(BuildType)\mimalloc-test-stress @@ -52,7 +53,7 @@ jobs: # artifact: mimalloc-windows-$(BuildType) - job: - displayName: Linux + displayName: Ubuntu 22.04 pool: vmImage: ubuntu-22.04 @@ -112,8 +113,13 @@ jobs: CC: clang CXX: clang++ BuildType: debug-tsan-clang-cxx - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_USE_CXX=ON -DMI_DEBUG_TSAN=ON - + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=RelWithDebInfo -DMI_USE_CXX=ON -DMI_DEBUG_TSAN=ON + Debug Guarded Clang: + CC: clang + CXX: clang + BuildType: debug-guarded-clang + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=RelWithDebInfo -DMI_DEBUG_FULL=ON -DMI_GUARDED=ON + steps: - task: CMake@1 
inputs: @@ -121,17 +127,19 @@ jobs: cmakeArgs: .. $(cmakeExtraArgs) - script: make -j$(nproc) -C $(BuildType) displayName: Make - - script: ctest --verbose --timeout 180 + - script: ctest --verbose --timeout 240 workingDirectory: $(BuildType) displayName: CTest + env: + MIMALLOC_GUARDED_SAMPLE_RATE: 1000 # - upload: $(Build.SourcesDirectory)/$(BuildType) # artifact: mimalloc-ubuntu-$(BuildType) - job: - displayName: macOS + displayName: macOS 14 (Sonoma) pool: vmImage: - macOS-latest + macOS-14 strategy: matrix: Debug: @@ -150,48 +158,152 @@ jobs: cmakeArgs: .. $(cmakeExtraArgs) - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) displayName: Make - # - script: MIMALLOC_VERBOSE=1 ./mimalloc-test-api - # workingDirectory: $(BuildType) - # displayName: TestAPI - # - script: MIMALLOC_VERBOSE=1 ./mimalloc-test-stress - # workingDirectory: $(BuildType) - # displayName: TestStress - - script: ctest --verbose --timeout 120 + - script: ctest --verbose --timeout 240 workingDirectory: $(BuildType) displayName: CTest # - upload: $(Build.SourcesDirectory)/$(BuildType) # artifact: mimalloc-macos-$(BuildType) -# - job: -# displayName: Windows-2017 -# pool: -# vmImage: -# vs2017-win2016 -# strategy: -# matrix: -# Debug: -# BuildType: debug -# cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -# MSBuildConfiguration: Debug -# Release: -# BuildType: release -# cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Release -# MSBuildConfiguration: Release -# Secure: -# BuildType: secure -# cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON -# MSBuildConfiguration: Release -# steps: -# - task: CMake@1 -# inputs: -# workingDirectory: $(BuildType) -# cmakeArgs: .. 
$(cmakeExtraArgs) -# - task: MSBuild@1 -# inputs: -# solution: $(BuildType)/libmimalloc.sln -# configuration: '$(MSBuildConfiguration)' -# - script: | -# cd $(BuildType) -# ctest --verbose --timeout 120 -# displayName: CTest +# ---------------------------------------------------------- +# Other OS versions (just debug mode) +# ---------------------------------------------------------- + +- job: + displayName: Windows 2019 + pool: + vmImage: + windows-2019 + strategy: + matrix: + Debug: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + MSBuildConfiguration: Debug + Release: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + MSBuildConfiguration: Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. $(cmakeExtraArgs) + - task: MSBuild@1 + inputs: + solution: $(BuildType)/libmimalloc.sln + configuration: '$(MSBuildConfiguration)' + msbuildArguments: -m + - script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration) + workingDirectory: $(BuildType) + displayName: CTest + +- job: + displayName: Ubuntu 24.04 + pool: + vmImage: + ubuntu-24.04 + strategy: + matrix: + Debug: + CC: gcc + CXX: g++ + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Debug++: + CC: gcc + CXX: g++ + BuildType: debug-cxx + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON + Debug Clang: + CC: clang + CXX: clang++ + BuildType: debug-clang + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Debug++ Clang: + CC: clang + CXX: clang++ + BuildType: debug-clang-cxx + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON + Release Clang: + CC: clang + CXX: clang++ + BuildType: release-clang + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. 
$(cmakeExtraArgs) + - script: make -j$(nproc) -C $(BuildType) + displayName: Make + - script: ctest --verbose --timeout 240 + workingDirectory: $(BuildType) + displayName: CTest + +- job: + displayName: Ubuntu 20.04 + pool: + vmImage: + ubuntu-20.04 + strategy: + matrix: + Debug: + CC: gcc + CXX: g++ + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Debug++: + CC: gcc + CXX: g++ + BuildType: debug-cxx + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON + Debug Clang: + CC: clang + CXX: clang++ + BuildType: debug-clang + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Debug++ Clang: + CC: clang + CXX: clang++ + BuildType: debug-clang-cxx + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON + Release Clang: + CC: clang + CXX: clang++ + BuildType: release-clang + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. $(cmakeExtraArgs) + - script: make -j$(nproc) -C $(BuildType) + displayName: Make + - script: ctest --verbose --timeout 240 + workingDirectory: $(BuildType) + displayName: CTest + +- job: + displayName: macOS 15 (Sequoia) + pool: + vmImage: + macOS-15 + strategy: + matrix: + Debug: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Release: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. 
$(cmakeExtraArgs) + - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) + displayName: Make + - script: ctest --verbose --timeout 240 + workingDirectory: $(BuildType) + displayName: CTest diff --git a/third-party/mimalloc/bin/mimalloc-redirect-arm64.dll b/third-party/mimalloc/bin/mimalloc-redirect-arm64.dll new file mode 100644 index 0000000000..455f8394df Binary files /dev/null and b/third-party/mimalloc/bin/mimalloc-redirect-arm64.dll differ diff --git a/third-party/mimalloc/bin/mimalloc-redirect-arm64.lib b/third-party/mimalloc/bin/mimalloc-redirect-arm64.lib new file mode 100644 index 0000000000..0445ce8361 Binary files /dev/null and b/third-party/mimalloc/bin/mimalloc-redirect-arm64.lib differ diff --git a/third-party/mimalloc/bin/mimalloc-redirect-arm64ec.dll b/third-party/mimalloc/bin/mimalloc-redirect-arm64ec.dll new file mode 100644 index 0000000000..62569b57fe Binary files /dev/null and b/third-party/mimalloc/bin/mimalloc-redirect-arm64ec.dll differ diff --git a/third-party/mimalloc/bin/mimalloc-redirect-arm64ec.lib b/third-party/mimalloc/bin/mimalloc-redirect-arm64ec.lib new file mode 100644 index 0000000000..eb724d7405 Binary files /dev/null and b/third-party/mimalloc/bin/mimalloc-redirect-arm64ec.lib differ diff --git a/third-party/mimalloc/bin/mimalloc-redirect.dll b/third-party/mimalloc/bin/mimalloc-redirect.dll index a3a3591ff9..7d0ec33b88 100644 Binary files a/third-party/mimalloc/bin/mimalloc-redirect.dll and b/third-party/mimalloc/bin/mimalloc-redirect.dll differ diff --git a/third-party/mimalloc/bin/mimalloc-redirect.lib b/third-party/mimalloc/bin/mimalloc-redirect.lib index de128bb948..851455a50a 100644 Binary files a/third-party/mimalloc/bin/mimalloc-redirect.lib and b/third-party/mimalloc/bin/mimalloc-redirect.lib differ diff --git a/third-party/mimalloc/bin/mimalloc-redirect32.dll b/third-party/mimalloc/bin/mimalloc-redirect32.dll index 522723e501..cc661036ce 100644 Binary files a/third-party/mimalloc/bin/mimalloc-redirect32.dll and 
b/third-party/mimalloc/bin/mimalloc-redirect32.dll differ diff --git a/third-party/mimalloc/bin/mimalloc-redirect32.lib b/third-party/mimalloc/bin/mimalloc-redirect32.lib index 87f19b8ec0..45d7297d8e 100644 Binary files a/third-party/mimalloc/bin/mimalloc-redirect32.lib and b/third-party/mimalloc/bin/mimalloc-redirect32.lib differ diff --git a/third-party/mimalloc/bin/minject-arm64.exe b/third-party/mimalloc/bin/minject-arm64.exe new file mode 100644 index 0000000000..637c95d915 Binary files /dev/null and b/third-party/mimalloc/bin/minject-arm64.exe differ diff --git a/third-party/mimalloc/bin/minject.exe b/third-party/mimalloc/bin/minject.exe index dba8f80fd2..bb445706b6 100644 Binary files a/third-party/mimalloc/bin/minject.exe and b/third-party/mimalloc/bin/minject.exe differ diff --git a/third-party/mimalloc/bin/minject32.exe b/third-party/mimalloc/bin/minject32.exe index f837383b98..6dcb8da9cc 100644 Binary files a/third-party/mimalloc/bin/minject32.exe and b/third-party/mimalloc/bin/minject32.exe differ diff --git a/third-party/mimalloc/bin/readme.md b/third-party/mimalloc/bin/readme.md index 9b121bda59..bc115ce160 100644 --- a/third-party/mimalloc/bin/readme.md +++ b/third-party/mimalloc/bin/readme.md @@ -1,27 +1,30 @@ # Windows Override Dynamically overriding on mimalloc on Windows -is robust and has the particular advantage to be able to redirect all malloc/free calls that go through -the (dynamic) C runtime allocator, including those from other DLL's or libraries. -As it intercepts all allocation calls on a low level, it can be used reliably +is robust and has the particular advantage to be able to redirect all malloc/free calls +that go through the (dynamic) C runtime allocator, including those from other DLL's or +libraries. As it intercepts all allocation calls on a low level, it can be used reliably on large programs that include other 3rd party components. 
-There are four requirements to make the overriding work robustly: +There are four requirements to make the overriding work well: 1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). -2. Link your program explicitly with `mimalloc-override.dll` library. - To ensure the `mimalloc-override.dll` is loaded at run-time it is easiest to insert some - call to the mimalloc API in the `main` function, like `mi_version()` - (or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project - for an example on how to use this. +2. Link your program explicitly with the `mimalloc.lib` export library for + the `mimalloc.dll` -- which contains all mimalloc functionality. + To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest + to insert some call to the mimalloc API in the `main` function, like `mi_version()` + (or use the `/include:mi_version` switch on the linker, or + similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). + See the `mimalloc-test-override` project for an example on how to use this. -3. The `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be put - in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency of that DLL). - The redirection DLL ensures that all calls to the C runtime malloc API get redirected to - mimalloc functions (which reside in `mimalloc-override.dll`). +3. The `mimalloc-redirect.dll` must be put in the same folder as the main + `mimalloc.dll` at runtime (as it is a dependency of that DLL). + The redirection DLL ensures that all calls to the C runtime malloc API get + redirected to mimalloc functions (which reside in `mimalloc.dll`). -4. Ensure the `mimalloc-override.dll` comes as early as possible in the import +4. Ensure the `mimalloc.dll` comes as early as possible in the import list of the final executable (so it can intercept all potential allocations). 
+ You can use `minject -l ` to check this if needed. For best performance on Windows with C++, it is also recommended to also override the `new`/`delete` operations (by including @@ -29,18 +32,43 @@ is also recommended to also override the `new`/`delete` operations (by including a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic -overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. +overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully +redirected. -## Minject +### Other Platforms -We cannot always re-link an executable with `mimalloc-override.dll`, and similarly, we cannot always -ensure the the DLL comes first in the import table of the final executable. +You always link with `mimalloc.dll` but for different platforms you may +need a specific redirection DLL: + +- __x64__: `mimalloc-redirect.dll`. +- __x86__: `mimalloc-redirect32.dll`. Use for older 32-bit Windows programs. +- __arm64__: `mimalloc-redirect-arm64.dll`. Use for native Windows arm64 programs. +- __arm64ec__: `mimalloc-redirect-arm64ec.dll`. The [arm64ec] ABI is "emulation compatible" + mode on Windows arm64. Unfortunately we cannot run x64 code emulated on Windows arm64 with + the x64 mimalloc override directly (since the C runtime always uses `arm64ec`). Instead: + 1. Build the program as normal for x64 and link as normal with the x64 + `mimalloc.lib` export library. + 2. Now separately build `mimalloc.dll` in `arm64ec` mode and _overwrite_ your + previous (x64) `mimalloc.dll` -- the loader can handle the mix of arm64ec + and x64 code. Now use `mimalloc-redirect-arm64ec.dll` to match your new + arm64ec `mimalloc.dll`. The main program stays as is and can be fully x64 + or contain more arm64ec modules. At runtime, the arm64ec `mimalloc.dll` will + run with native arm64 instructions while the rest of the program runs emulated x64. 
+
+[arm64ec]: https://learn.microsoft.com/en-us/windows/arm/arm64ec
+
+
+### Minject
+
+We cannot always re-link an executable with `mimalloc.dll`, and similarly, we
+cannot always ensure that the DLL comes first in the import table of the final executable.
 In many cases though we can patch existing executables without any recompilation
-if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll`
-into the import table (and put `mimalloc-redirect.dll` in the same folder)
-Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388).
+if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the
+`mimalloc.dll` into the import table (and put `mimalloc-redirect.dll` in the same
+directory). Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388).
 
-The `minject` program can also do this from the command line, use `minject --help` for options:
+The `minject` program can also do this from the command line.
+Use `minject --help` for options:
 
 ```
 > minject --help
@@ -58,8 +86,8 @@ options:
   -l --list           only list imported modules
   -i --inplace        update the exe in-place (make sure there is a backup!)
   -f --force          always overwrite without prompting
-  --postfix=

use

as a postfix to the mimalloc dll (default is 'override') - e.g. use --postfix=override-debug to link with mimalloc-override-debug.dll + --postfix=

use

as a postfix to the mimalloc dll. + e.g. use --postfix=debug to link with mimalloc-debug.dll notes: Without '--inplace' an injected is generated with the same name ending in '-mi'. @@ -69,3 +97,6 @@ examples: > minject --list myprogram.exe > minject --force --inplace myprogram.exe ``` + +For x86 32-bit binaries, use `minject32`, and for arm64 binaries use `minject-arm64`. + diff --git a/third-party/mimalloc/cmake/mimalloc-config-version.cmake b/third-party/mimalloc/cmake/mimalloc-config-version.cmake index 81fd3c9da7..f3ed36ab5b 100644 --- a/third-party/mimalloc/cmake/mimalloc-config-version.cmake +++ b/third-party/mimalloc/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 2) set(mi_version_minor 1) -set(mi_version_patch 7) +set(mi_version_patch 9) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/third-party/mimalloc/doc/doxyfile b/third-party/mimalloc/doc/doxyfile index d03a70f57c..53f874cfb2 100644 --- a/third-party/mimalloc/doc/doxyfile +++ b/third-party/mimalloc/doc/doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.9.1 +# Doxyfile 1.11.0 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -12,6 +12,16 @@ # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). 
+# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options @@ -53,6 +63,12 @@ PROJECT_BRIEF = PROJECT_LOGO = mimalloc-logo.svg +# With the PROJECT_ICON tag one can specify an icon that is included in the tabs +# when the HTML document is shown. Doxygen will copy the logo to the output +# directory. + +PROJECT_ICON = + # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If @@ -60,16 +76,28 @@ PROJECT_LOGO = mimalloc-logo.svg OUTPUT_DIRECTORY = .. -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes -# performance problems for the file system. +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. 
Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode @@ -81,26 +109,18 @@ ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English -# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all generated output in the proper direction. -# Possible values are: None, LTR, RTL and Context. -# The default value is: None. - -OUTPUT_TEXT_DIRECTION = None - # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -258,16 +278,16 @@ TAB_SIZE = 2 # the documentation. An alias has the form: # name=value # For example adding -# "sideeffect=@par Side Effects:\n" +# "sideeffect=@par Side Effects:^^" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. 
-# When you need a literal { or } or , in the value part of an alias you have to -# escape them by means of a backslash (\), this can lead to conflicts with the -# commands \{ and \} for these it is advised to use the version @{ and @} or use -# a double escape (\\{ and \\}) +# "Side Effects:". Note that you cannot put \n's in the value part of an alias +# to insert newlines (in the resulting output). You can put ^^ in the value part +# of an alias to insert a newline as if a physical newline was in the original +# file. When you need a literal { or } or , in the value part of an alias you +# have to escape them by means of a backslash (\), this can lead to conflicts +# with the commands \{ and \} for these it is advised to use the version @{ and +# @} or use a double escape (\\{ and \\}) ALIASES = @@ -312,8 +332,8 @@ OPTIMIZE_OUTPUT_SLICE = NO # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, JavaScript, -# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, -# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: # FortranFree, unknown formatted Fortran: Fortran. In the later case the parser # tries to guess whether the code is fixed or free formatted code, this is the # default for Fortran type files). For instance to make doxygen treat .inc files @@ -344,11 +364,22 @@ MARKDOWN_SUPPORT = YES # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. -# Minimum value: 0, maximum value: 99, default value: 5. +# Minimum value: 0, maximum value: 99, default value: 6. 
# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0 and GITHUB use the lower case version of title +# with any whitespace replaced by '-' and punctuation characters removed. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -361,8 +392,8 @@ AUTOLINK_SUPPORT = YES # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. +# versus func(std::string) {}). This also makes the inheritance and +# collaboration diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO @@ -374,9 +405,9 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. +# https://www.riverbankcomputing.com/software) sources only. 
Doxygen will parse +# them like normal C++ but will assume all classes use public instead of private +# inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO @@ -460,7 +491,7 @@ TYPEDEF_HIDES_STRUCT = YES LOOKUP_CACHE_SIZE = 0 -# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use # during processing. When set to 0 doxygen will based this on the number of # cores available in the system. You can set it explicitly to a value larger # than 0 to get more control over the balance between CPU load and processing @@ -473,6 +504,14 @@ LOOKUP_CACHE_SIZE = 0 NUM_PROC_THREADS = 1 +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. + +TIMESTAMP = NO + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -554,7 +593,8 @@ HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO @@ -585,14 +625,15 @@ INTERNAL_DOCS = NO # filesystem is case sensitive (i.e. it supports files in the same directory # whose names only differ in casing), the option must be set to YES to properly # deal with such files in case they appear in the input. 
For filesystems that -# are not case sensitive the option should be be set to NO to properly deal with +# are not case sensitive the option should be set to NO to properly deal with # output files written for symbols that only differ in casing, such as for two # classes, one named CLASS and the other named Class, and to also support # references to files without having to specify the exact matching casing. On # Windows (including Cygwin) and MacOS, users should typically set this option # to NO, whereas on Linux or other Unix flavors it should typically be set to # YES. -# The default value is: system dependent. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. CASE_SENSE_NAMES = NO @@ -610,6 +651,12 @@ HIDE_SCOPE_NAMES = NO HIDE_COMPOUND_REFERENCE= NO +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class +# will show which file needs to be included to use the class. +# The default value is: YES. + +SHOW_HEADERFILE = YES + # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. @@ -767,7 +814,8 @@ FILE_VERSION_FILTER = # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. +# will be used as the name of the layout file. See also section "Changing the +# layout of pages" for information. 
# # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE @@ -813,27 +861,50 @@ WARNINGS = YES WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. +# potential errors in the documentation, such as documenting some parameters in +# a documented function twice, or documenting parameters that don't exist or +# using markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES +# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete +# function parameter documentation. If set to NO, doxygen will accept that some +# parameters have no documentation without warning. +# The default value is: YES. + +WARN_IF_INCOMPLETE_DOC = YES + # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. If -# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# value. If set to NO, doxygen will only warn about wrong parameter +# documentation, but not about the absence of documentation. If EXTRACT_ALL is +# set to YES then this flag will automatically be disabled. See also +# WARN_IF_INCOMPLETE_DOC # The default value is: NO. WARN_NO_PARAMDOC = NO +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about +# undocumented enumeration values. If set to NO, doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. 
+ +WARN_IF_UNDOC_ENUM_VAL = NO + # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. -# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. # The default value is: NO. WARN_AS_ERROR = NO @@ -844,13 +915,27 @@ WARN_AS_ERROR = NO # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). 
+# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). WARN_LOGFILE = @@ -871,10 +956,21 @@ INPUT = mimalloc-doc.h # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: # https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 +# This tag can be used to specify the character encoding of the source files +# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding instead of the default +# INPUT_ENCODING) if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). +# See also: INPUT_ENCODING for further information on supported encodings. + +INPUT_FILE_ENCODING = + # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. @@ -886,12 +982,12 @@ INPUT_ENCODING = UTF-8 # Note the list of default checked file patterns might differ from the list of # default file extension mappings. # -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, -# *.ucf, *.qsf and *.ice. 
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm, +# *.cpp, *.cppm, *.ccm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, +# *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, +# *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to +# be provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ @@ -973,10 +1069,7 @@ EXCLUDE_PATTERNS = # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* +# ANamespace::AClass, ANamespace::*Test EXCLUDE_SYMBOLS = @@ -1021,6 +1114,11 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # +# Note that doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. @@ -1062,6 +1160,15 @@ FILTER_SOURCE_PATTERNS = USE_MDFILE_AS_MAINPAGE = +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. 
The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. + +FORTRAN_COMMENT_AFTER = 72 + #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- @@ -1076,7 +1183,8 @@ USE_MDFILE_AS_MAINPAGE = SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. +# multi-line macros, enums or list initialized variables directly into the +# documentation. # The default value is: NO. INLINE_SOURCES = NO @@ -1159,9 +1267,11 @@ VERBATIM_HEADERS = YES CLANG_ASSISTED_PARSING = NO -# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to -# YES then doxygen will add the directory of each input to the include path. +# If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS +# tag is set to YES then doxygen will add the directory of each input to the +# include path. # The default value is: YES. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_ADD_INC_PATHS = YES @@ -1197,10 +1307,11 @@ CLANG_DATABASE_PATH = ALPHABETICAL_INDEX = YES -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. 
The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = @@ -1279,7 +1390,12 @@ HTML_STYLESHEET = # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. +# list). +# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = mimalloc-doxygen.css @@ -1294,9 +1410,22 @@ HTML_EXTRA_STYLESHEET = mimalloc-doxygen.css HTML_EXTRA_FILES = +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generates light mode output, DARK always +# generates dark mode output, AUTO_LIGHT automatically sets the mode according +# to the user preference, uses light mode if no preference is set (the default), +# AUTO_DARK automatically sets the mode according to the user preference, uses +# dark mode if no preference is set and TOGGLE allows a user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE = LIGHT + # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see +# this color. 
Hue is specified as an angle on a color-wheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. @@ -1306,7 +1435,7 @@ HTML_EXTRA_FILES = HTML_COLORSTYLE_HUE = 189 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A +# in the HTML output. For a value of 0 the output will use gray-scales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1324,15 +1453,6 @@ HTML_COLORSTYLE_SAT = 12 HTML_COLORSTYLE_GAMMA = 240 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will @@ -1352,6 +1472,33 @@ HTML_DYNAMIC_MENUS = NO HTML_DYNAMIC_SECTIONS = NO +# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be +# dynamically folded and expanded in the generated HTML source code. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_CODE_FOLDING = YES + +# If the HTML_COPY_CLIPBOARD tag is set to YES then doxygen will show an icon in +# the top right corner of code and text fragments that allows the user to copy +# its content to the clipboard. 
Note this only works if supported by the browser +# and the web page is served via a secure context (see: +# https://www.w3.org/TR/secure-contexts/), i.e. using the https: or file: +# protocol. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COPY_CLIPBOARD = YES + +# Doxygen stores a couple of settings persistently in the browser (via e.g. +# cookies). By default these settings apply to all HTML pages generated by +# doxygen across all projects. The HTML_PROJECT_COOKIE tag can be used to store +# the settings under a project specific key, such that the user preferences will +# be stored separately. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_PROJECT_COOKIE = + # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to @@ -1388,6 +1535,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1413,8 +1567,12 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: -# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. +# on Windows. 
In the beginning of 2021 Microsoft took the original page, with +# a.o. the download links, offline the HTML help workshop was already many years +# in maintenance mode). You can download the HTML help workshop from the web +# archives at Installation executable (see: +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1471,6 +1629,16 @@ BINARY_TOC = NO TOC_EXPAND = NO +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. + +SITEMAP_URL = + # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help @@ -1573,16 +1741,28 @@ DISABLE_INDEX = YES # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. 
Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. +# further fine tune the look of the index (see "Fine-tuning the output"). As an +# example, the default style sheet generated by doxygen has an example that +# shows how to put an image at the root of the tree instead of the PROJECT_NAME. +# Since the tree basically has the same information as the tab index, you could +# consider setting DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = YES +# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the +# FULL_SIDEBAR option determines if the side bar is limited to only the treeview +# area (value NO) or if it should extend to the full height of the window (value +# YES). Setting this to YES gives a layout similar to +# https://docs.readthedocs.io with more room for contents, but less room for the +# project logo, title, and description. If either GENERATE_TREEVIEW or +# DISABLE_INDEX is set to NO, this option has no effect. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FULL_SIDEBAR = NO + # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # @@ -1607,6 +1787,13 @@ TREEVIEW_WIDTH = 180 EXT_LINKS_IN_WINDOW = NO +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +OBFUSCATE_EMAILS = YES + # If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see # https://inkscape.org) to generate formulas as SVG images instead of PNGs for @@ -1627,17 +1814,6 @@ HTML_FORMULA_FORMAT = png FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands # to create new LaTeX commands to be used in formulas as building blocks. See # the section "Including formulas" for details. @@ -1655,11 +1831,29 @@ FORMULA_MACROFILE = USE_MATHJAX = NO +# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. +# Note that the different versions of MathJax have different requirements with +# regards to the different settings, so it is possible that also other MathJax +# settings have to be changed when switching between the different MathJax +# versions. +# Possible values are: MathJax_2 and MathJax_3. +# The default value is: MathJax_2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_VERSION = MathJax_2 + # When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. +# the MathJax output. 
For more details about the output format see MathJax +# version 2 (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 +# (see: +# http://docs.mathjax.org/en/latest/web/components/output.html). # Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. +# compatibility. This is the name for Mathjax version 2, for MathJax version 3 +# this will be translated into chtml), NativeMML (i.e. MathML. Only supported +# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This +# is the name for Mathjax version 3, for MathJax version 2 this will be +# translated into HTML-CSS) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1672,15 +1866,21 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from https://www.mathjax.org before deployment. -# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. +# MathJax from https://www.mathjax.org before deployment. The default value is: +# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 +# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. 
For example +# for MathJax version 2 (see +# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions): # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# For example for MathJax version 3 (see +# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): +# MATHJAX_EXTENSIONS = ams # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = @@ -1860,29 +2060,31 @@ PAPER_TYPE = a4 EXTRA_PACKAGES = -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the -# generated LaTeX document. The header should contain everything until the first -# chapter. If it is left blank doxygen will generate a standard header. See -# section "Doxygen usage" for information on how to let doxygen write the -# default header to a separate file. +# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for +# the generated LaTeX document. The header should contain everything until the +# first chapter. If it is left blank doxygen will generate a standard header. It +# is highly recommended to start with a default header using +# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty +# and then modify the file new_header.tex. See also section "Doxygen usage" for +# information on how to generate the default header that doxygen normally uses. # -# Note: Only use a user-defined header if you know what you are doing! The -# following commands have a special meaning inside the header: $title, -# $datetime, $date, $doxygenversion, $projectname, $projectnumber, -# $projectbrief, $projectlogo. Doxygen will replace $title with the empty -# string, for the replacement values of the other commands the user is referred -# to HTML_HEADER. +# Note: Only use a user-defined header if you know what you are doing! +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. 
The following +# commands have a special meaning inside the header (and footer): For a +# description of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_HEADER = -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the -# generated LaTeX document. The footer should contain everything after the last -# chapter. If it is left blank doxygen will generate a standard footer. See +# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for +# the generated LaTeX document. The footer should contain everything after the +# last chapter. If it is left blank doxygen will generate a standard footer. See # LATEX_HEADER for more information on how to generate a default footer and what -# special commands can be used inside the footer. -# -# Note: Only use a user-defined footer if you know what you are doing! +# special commands can be used inside the footer. See also section "Doxygen +# usage" for information on how to generate the default footer that doxygen +# normally uses. Note: Only use a user-defined footer if you know what you are +# doing! # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_FOOTER = @@ -1925,10 +2127,16 @@ PDF_HYPERLINKS = YES USE_PDFLATEX = YES -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. This option is also used -# when generating formulas in HTML. +# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error. 
+# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch +# mode nothing is printed on the terminal, errors are scrolled as if <return> is +# hit at every error; missing files that TeX tries to input or request from +# keyboard input (\read on a not open input stream) cause the job to abort, +# NON_STOP In nonstop mode the diagnostic message will appear on the terminal, +# but there is no possibility of user interaction just like in batch mode, +# SCROLL In scroll mode, TeX will stop only for missing files to input or if +# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at +# each error, asking for user intervention. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1941,16 +2149,6 @@ LATEX_BATCHMODE = NO LATEX_HIDE_INDICES = NO -# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source -# code with syntax highlighting in the LaTeX output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_SOURCE_CODE = NO - # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See # https://en.wikipedia.org/wiki/BibTeX and \cite for more info. @@ -1959,14 +2157,6 @@ LATEX_SOURCE_CODE = NO LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. 
If left blank the @@ -2031,15 +2221,13 @@ RTF_STYLESHEET_FILE = RTF_EXTENSIONS_FILE = -# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code -# with syntax highlighting in the RTF output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. +# The RTF_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the RTF_OUTPUT output directory. +# Note that the files will be copied as-is; there are no commands or markers +# available. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_SOURCE_CODE = NO +RTF_EXTRA_FILES = #--------------------------------------------------------------------------- # Configuration options related to the man page output @@ -2137,21 +2325,12 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook -# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the -# program listings (including syntax highlighting and cross-referencing -# information) to the DOCBOOK output. Note that enabling this will significantly -# increase the size of the DOCBOOK output. -# The default value is: NO. -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. - -DOCBOOK_PROGRAMLISTING = NO - #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures # the structure of the code including all documentation. Note that this feature # is still experimental and incomplete at the moment. # The default value is: NO. 
@@ -2162,6 +2341,28 @@ GENERATE_AUTOGEN_DEF = NO # Configuration options related to Sqlite3 output #--------------------------------------------------------------------------- +# If the GENERATE_SQLITE3 tag is set to YES doxygen will generate a Sqlite3 +# database with symbols found by doxygen stored in tables. +# The default value is: NO. + +GENERATE_SQLITE3 = NO + +# The SQLITE3_OUTPUT tag is used to specify where the Sqlite3 database will be +# put. If a relative path is entered the value of OUTPUT_DIRECTORY will be put +# in front of it. +# The default directory is: sqlite3. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_OUTPUT = sqlite3 + +# The SQLITE3_RECREATE_DB tag is set to YES, the existing doxygen_sqlite3.db +# database file will be recreated with each doxygen run. If set to NO, doxygen +# will warn if a database file is already found and not modify it. +# The default value is: YES. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_RECREATE_DB = YES + #--------------------------------------------------------------------------- # Configuration options related to the Perl module output #--------------------------------------------------------------------------- @@ -2236,7 +2437,8 @@ SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by the -# preprocessor. +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of +# RECURSIVE has no effect here. # This tag requires that the tag SEARCH_INCLUDES is set to YES. INCLUDE_PATH = @@ -2303,15 +2505,15 @@ TAGFILES = GENERATE_TAGFILE = -# If the ALLEXTERNALS tag is set to YES, all external class will be listed in -# the class index. If set to NO, only the inherited external classes will be -# listed. 
+# If the ALLEXTERNALS tag is set to YES, all external classes and namespaces +# will be listed in the class and namespace index. If set to NO, only the +# inherited external classes will be listed. # The default value is: NO. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will be +# in the topic index. If set to NO, only the current project's groups will be # listed. # The default value is: YES. @@ -2325,25 +2527,9 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES #--------------------------------------------------------------------------- -# Configuration options related to the dot tool +# Configuration options related to diagram generator tools #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to -# NO turns the diagrams off. Note that this option also works with HAVE_DOT -# disabled, but it is recommended to install and use dot, since it yields more -# powerful graphs. -# The default value is: YES. - -CLASS_DIAGRAMS = YES - -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. - -DIA_PATH = - # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. # The default value is: YES. @@ -2352,7 +2538,7 @@ HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. 
This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO # The default value is: NO. @@ -2369,49 +2555,77 @@ HAVE_DOT = NO DOT_NUM_THREADS = 0 -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. +# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of +# subgraphs. When you want a differently looking font in the dot files that +# doxygen generates you can specify fontname, fontcolor and fontsize attributes. +# For details please see Node, +# Edge and Graph Attributes specification You need to make sure dot is able +# to find the font, which can be done by putting it in a standard location or by +# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. Default graphviz fontsize is 14. +# The default value is: fontname=Helvetica,fontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTNAME = Helvetica +DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. +# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can +# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about +# arrows shapes. +# The default value is: labelfontname=Helvetica,labelfontsize=10. 
# This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTSIZE = 10 +DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. +# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes +# around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification +# The default value is: shape=box,height=0.2,width=0.4. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" + +# You can set the path where dot can find font specified with fontname in +# DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will +# generate a graph for each documented class showing the direct and indirect +# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and +# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case +# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the +# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. +# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance +# relations will be shown as texts / links. Explicit enabling an inheritance +# graph or choosing a different representation for an inheritance graph of a +# specific class, can be accomplished by means of the command \inheritancegraph. +# Disabling an inheritance graph can be accomplished by means of the command +# \hideinheritancegraph. 
+# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a # graph for each documented class showing the direct and indirect implementation # dependencies (inheritance, containment, and class references variables) of the -# class with other documented classes. +# class with other documented classes. Explicit enabling a collaboration graph, +# when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the +# command \collaborationgraph. Disabling a collaboration graph can be +# accomplished by means of the command \hidecollaborationgraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. +# groups, showing the direct groups dependencies. Explicit enabling a group +# dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means +# of the command \groupgraph. Disabling a directory graph can be accomplished by +# means of the command \hidegroupgraph. See also the chapter Grouping in the +# manual. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2453,8 +2667,8 @@ DOT_UML_DETAILS = NO # The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters # to display on a single line. If the actual line length exceeds this threshold -# significantly it will wrapped across multiple lines. Some heuristics are apply -# to avoid ugly line breaks. +# significantly it will be wrapped across multiple lines. Some heuristics are +# applied to avoid ugly line breaks. # Minimum value: 0, maximum value: 1000, default value: 17. # This tag requires that the tag HAVE_DOT is set to YES. 
@@ -2471,7 +2685,9 @@ TEMPLATE_RELATIONS = NO # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to # YES then doxygen will generate a graph for each documented file showing the # direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an include graph, when INCLUDE_GRAPH is is set to NO, +# can be accomplished by means of the command \includegraph. Disabling an +# include graph can be accomplished by means of the command \hideincludegraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2480,7 +2696,10 @@ INCLUDE_GRAPH = YES # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are # set to YES then doxygen will generate a graph for each documented file showing # the direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set +# to NO, can be accomplished by means of the command \includedbygraph. Disabling +# an included by graph can be accomplished by means of the command +# \hideincludedbygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2520,16 +2739,26 @@ GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the # dependencies a directory has on other directories in a graphical way. The # dependency relations are determined by the #include relations between the -# files in the directories. +# files in the directories. Explicit enabling a directory graph, when +# DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command +# \directorygraph. Disabling a directory graph can be accomplished by means of +# the command \hidedirectorygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. 
DIRECTORY_GRAPH = YES +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. +# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. + +DIR_GRAPH_MAX_DEPTH = 1 + # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). +# https://www.graphviz.org/)). # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). @@ -2566,11 +2795,12 @@ DOT_PATH = DOTFILE_DIRS = -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). +# You can include diagrams made with dia in doxygen documentation. Doxygen will +# then run dia to produce the diagram and insert it in the documentation. The +# DIA_PATH tag allows you to specify the directory where the dia binary resides. +# If left empty dia is assumed to be found in the default search path. -MSCFILE_DIRS = +DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile @@ -2579,10 +2809,10 @@ MSCFILE_DIRS = DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. 
+# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = @@ -2599,7 +2829,7 @@ PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes # larger than this value, doxygen will truncate the graph, which is visualized -# by representing a node as a red box. Note that doxygen if the number of direct +# by representing a node as a red box. Note that if the number of direct # children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that # the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. @@ -2620,18 +2850,6 @@ DOT_GRAPH_MAX_NODES = 50 MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO - # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) support @@ -2644,6 +2862,8 @@ DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. 
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2652,8 +2872,24 @@ GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. # -# Note: This setting is not only used for dot files but also for msc and -# plantuml temporary files. +# Note: This setting is not only used for dot files but also for msc temporary +# files. # The default value is: YES. DOT_CLEANUP = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will +# use a built-in version of mscgen tool to produce the charts. Alternatively, +# the MSCGEN_TOOL tag can also specify the name an external tool. For instance, +# specifying prog as the value, doxygen will call the tool as prog -T +# <outfile_format> -o <outfile> <infile>. The external tool should support +# output file formats "png", "eps", "svg", and "ismap". + +MSCGEN_TOOL = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the \mscfile +# command). + +MSCFILE_DIRS = diff --git a/third-party/mimalloc/doc/mimalloc-doc.h b/third-party/mimalloc/doc/mimalloc-doc.h index d79eb2f866..753c062f4a 100644 --- a/third-party/mimalloc/doc/mimalloc-doc.h +++ b/third-party/mimalloc/doc/mimalloc-doc.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
@@ -25,12 +25,15 @@ without code changes, for example, on Unix you can use it as: ``` Notable aspects of the design include: - - __small and consistent__: the library is about 8k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic _heartbeat_ and deferred freeing (for bounded worst-case times with reference counting). + Partly due to its simplicity, mimalloc has been ported to many systems (Windows, macOS, + Linux, WASM, various BSD's, Haiku, MUSL, etc) and has excellent support for dynamic overriding. + At the same time, it is an industrial strength allocator that runs (very) large scale + distributed services on thousands of machines with excellent worst case latencies. - __free list sharding__: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality -- @@ -45,23 +48,23 @@ Notable aspects of the design include: and the chance of contending on a single location will be low -- this is quite similar to randomized algorithms like skip lists where adding a random oracle removes the need for a more complex algorithm. -- __eager page reset__: when a "page" becomes empty (with increased chance - due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged") +- __eager page purging__: when a "page" becomes empty (with increased chance + due to free list sharding) the memory is marked to the OS as unused (reset or decommitted) reducing (real) memory pressure and fragmentation, especially in long running programs. -- __secure__: _mimalloc_ can be build in secure mode, adding guard pages, +- __secure__: _mimalloc_ can be built in secure mode, adding guard pages, randomized allocation, encrypted free lists, etc. to protect against various - heap vulnerabilities. The performance penalty is only around 5% on average + heap vulnerabilities. 
The performance penalty is usually around 10% on average over our benchmarks. - __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions. A heap can be destroyed at once instead of deallocating each object separately. - __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation - times (_wcat_), bounded space overhead (~0.2% meta-data, with low internal fragmentation), - and has no internal points of contention using only atomic operations. -- __fast__: In our benchmarks (see [below](#performance)), - _mimalloc_ outperforms all other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), - and usually uses less memory (up to 25% more in the worst case). A nice property - is that it does consistently well over a wide range of benchmarks. + times (_wcat_) (upto OS primitives), bounded space overhead (~0.2% meta-data, with low + internal fragmentation), and has no internal points of contention using only atomic operations. +- __fast__: In our benchmarks (see [below](#bench)), + _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), + and often uses less memory. A nice property is that it does consistently well over a wide range + of benchmarks. There is also good huge OS page support for larger server programs. You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) @@ -278,8 +281,7 @@ void* mi_zalloc_small(size_t size); /// The returned size can be /// used to call \a mi_expand successfully. /// The returned size is always at least equal to the -/// allocated size of \a p, and, in the current design, -/// should be less than 16.7% more. +/// allocated size of \a p. 
/// /// @see [_msize](https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/msize?view=vs-2017) (Windows) /// @see [malloc_usable_size](http://man7.org/linux/man-pages/man3/malloc_usable_size.3.html) (Linux) @@ -304,7 +306,7 @@ size_t mi_good_size(size_t size); /// in very narrow circumstances; in particular, when a long running thread /// allocates a lot of blocks that are freed by other threads it may improve /// resource usage by calling this every once in a while. -void mi_collect(bool force); +void mi_collect(bool force); /// Deprecated /// @param out Ignored, outputs to the registered output function or stderr by default. @@ -428,7 +430,7 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large); /// allocated in some manner and available for use my mimalloc. /// @param start Start of the memory area /// @param size The size of the memory area. -/// @param commit Is the area already committed? +/// @param is_committed Is the area already committed? /// @param is_large Does it consist of large OS pages? Set this to \a true as well for memory /// that should not be decommitted or protected (like rdma etc.) /// @param is_zero Does the area consists of zero's? @@ -453,7 +455,7 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t /// Reserve \a pages of huge OS pages (1GiB) at a specific \a numa_node, /// but stops after at most `timeout_msecs` seconds. /// @param pages The number of 1GiB pages to reserve. -/// @param numa_node The NUMA node where the memory is reserved (start at 0). +/// @param numa_node The NUMA node where the memory is reserved (start at 0). Use -1 for no affinity. /// @param timeout_msecs Maximum number of milli-seconds to try reserving, or 0 for no timeout. /// @returns 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. /// @@ -486,6 +488,91 @@ bool mi_is_redirected(); /// on other systems as the amount of read/write accessible memory reserved by mimalloc. 
void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults); +/// @brief Show all current arena's. +/// @param show_inuse Show the arena blocks that are in use. +/// @param show_abandoned Show the abandoned arena blocks. +/// @param show_purge Show arena blocks scheduled for purging. +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge); + +/// Mimalloc uses large (virtual) memory areas, called "arena"s, from the OS to manage its memory. +/// Each arena has an associated identifier. +typedef int mi_arena_id_t; + +/// @brief Return the size of an arena. +/// @param arena_id The arena identifier. +/// @param size Returned size in bytes of the (virtual) arena area. +/// @return base address of the arena. +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); + +/// @brief Reserve huge OS pages (1GiB) into a single arena. +/// @param pages Number of 1GiB pages to reserve. +/// @param numa_node The associated NUMA node, or -1 for no NUMA preference. +/// @param timeout_msecs Max amount of milli-seconds this operation is allowed to take. (0 is infinite) +/// @param exclusive If exclusive, only a heap associated with this arena can allocate in it. +/// @param arena_id The arena identifier. +/// @return 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id); + +/// @brief Reserve OS memory to be managed in an arena. +/// @param size Size the reserve. +/// @param commit Should the memory be initially committed? +/// @param allow_large Allow the use of large OS pages? +/// @param exclusive Is the returned arena exclusive? +/// @param arena_id The new arena identifier. +/// @return Zero on success, an error code otherwise. 
+int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); + +/// @brief Manage externally allocated memory as a mimalloc arena. This memory will not be freed by mimalloc. +/// @param start Start address of the area. +/// @param size Size in bytes of the area. +/// @param is_committed Is the memory already committed? +/// @param is_large Does it consist of (pinned) large OS pages? +/// @param is_zero Is the memory zero-initialized? +/// @param numa_node Associated NUMA node, or -1 to have no NUMA preference. +/// @param exclusive Is the arena exclusive (where only heaps associated with the arena can allocate in it) +/// @param arena_id The new arena identifier. +/// @return `true` if successful. +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id); + +/// @brief Create a new heap that only allocates in the specified arena. +/// @param arena_id The arena identifier. +/// @return The new heap or `NULL`. +mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); + +/// @brief Create a new heap +/// @param heap_tag The heap tag associated with this heap; heaps only reclaim memory between heaps with the same tag. +/// @param allow_destroy Is \a mi_heap_destroy allowed? Not allowing this allows the heap to reclaim memory from terminated threads. +/// @param arena_id If not 0, the heap will only allocate from the specified arena. +/// @return A new heap or `NULL` on failure. +/// +/// The \a arena_id can be used by runtimes to allocate only in a specified pre-reserved arena. +/// This is used for example for a compressed pointer heap in Koka. +/// The \a heap_tag enables heaps to keep objects of a certain type isolated to heaps with that tag. +/// This is used for example in the CPython integration. 
+mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id); + +/// A process can associate threads with sub-processes. +/// A sub-process will not reclaim memory from (abandoned heaps/threads) +/// other subprocesses. +typedef void* mi_subproc_id_t; + +/// @brief Get the main sub-process identifier. +mi_subproc_id_t mi_subproc_main(void); + +/// @brief Create a fresh sub-process (with no associated threads yet). +/// @return The new sub-process identifier. +mi_subproc_id_t mi_subproc_new(void); + +/// @brief Delete a previously created sub-process. +/// @param subproc The sub-process identifier. +/// Only delete sub-processes if all associated threads have terminated. +void mi_subproc_delete(mi_subproc_id_t subproc); + +/// Add the current thread to the given sub-process. +/// This should be called right after a thread is created (and no allocation has taken place yet) +void mi_subproc_add_current_thread(mi_subproc_id_t subproc); + + /// \} // ------------------------------------------------------ @@ -495,20 +582,24 @@ void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_m /// \defgroup aligned Aligned Allocation /// /// Allocating aligned memory blocks. +/// Note that `alignment` always follows `size` for consistency with the unaligned +/// allocation API, but unfortunately this differs from `posix_memalign` and `aligned_alloc` in the C library. /// /// \{ -/// The maximum supported alignment size (currently 1MiB). -#define MI_BLOCK_ALIGNMENT_MAX (1024*1024UL) - /// Allocate \a size bytes aligned by \a alignment. /// @param size number of bytes to allocate. -/// @param alignment the minimal alignment of the allocated memory. Must be less than #MI_BLOCK_ALIGNMENT_MAX. -/// @returns pointer to the allocated memory or \a NULL if out of memory. -/// The returned pointer is aligned by \a alignment, i.e. -/// `(uintptr_t)p % alignment == 0`. -/// +/// @param alignment the minimal alignment of the allocated memory. 
+/// @returns pointer to the allocated memory or \a NULL if out of memory, +/// or if the alignment is not a power of 2 (including 0). The \a size is unrestricted +/// (and does not have to be an integral multiple of the \a alignment). +/// The returned pointer is aligned by \a alignment, i.e. `(uintptr_t)p % alignment == 0`. /// Returns a unique pointer if called with \a size 0. +/// +/// Note that `alignment` always follows `size` for consistency with the unaligned +/// allocation API, but unfortunately this differs from `posix_memalign` and `aligned_alloc` in the C library. +/// +/// @see [aligned_alloc](https://en.cppreference.com/w/c/memory/aligned_alloc) (in the standard C11 library, with switched arguments!) /// @see [_aligned_malloc](https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2017) (on Windows) /// @see [aligned_alloc](http://man.openbsd.org/reallocarray) (on BSD, with switched arguments!) /// @see [posix_memalign](https://linux.die.net/man/3/posix_memalign) (on Posix, with switched arguments!) @@ -522,11 +613,12 @@ void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment); /// @param size number of bytes to allocate. /// @param alignment the minimal alignment of the allocated memory at \a offset. /// @param offset the offset that should be aligned. -/// @returns pointer to the allocated memory or \a NULL if out of memory. -/// The returned pointer is aligned by \a alignment at \a offset, i.e. -/// `((uintptr_t)p + offset) % alignment == 0`. -/// +/// @returns pointer to the allocated memory or \a NULL if out of memory, +/// or if the alignment is not a power of 2 (including 0). The \a size is unrestricted +/// (and does not have to be an integral multiple of the \a alignment). +/// The returned pointer is aligned by \a alignment at \a offset, i.e. `((uintptr_t)p + offset) % alignment == 0`. /// Returns a unique pointer if called with \a size 0. 
+/// /// @see [_aligned_offset_malloc](https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-offset-malloc?view=vs-2017) (on Windows) void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset); void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset); @@ -574,12 +666,12 @@ void mi_heap_delete(mi_heap_t* heap); /// heap is set to the backing heap. void mi_heap_destroy(mi_heap_t* heap); -/// Set the default heap to use for mi_malloc() et al. +/// Set the default heap to use in the current thread for mi_malloc() et al. /// @param heap The new default heap. /// @returns The previous default heap. mi_heap_t* mi_heap_set_default(mi_heap_t* heap); -/// Get the default heap that is used for mi_malloc() et al. +/// Get the default heap that is used for mi_malloc() et al. (for the current thread). /// @returns The current default heap. mi_heap_t* mi_heap_get_default(); @@ -764,6 +856,8 @@ typedef struct mi_heap_area_s { size_t committed; ///< current committed bytes of this area size_t used; ///< bytes in use by allocated blocks size_t block_size; ///< size in bytes of one block + size_t full_block_size; ///< size in bytes of a full block including padding and metadata. + int heap_tag; ///< heap tag associated with this area (see \a mi_heap_new_ex) } mi_heap_area_t; /// Visitor function passed to mi_heap_visit_blocks() @@ -788,6 +882,23 @@ typedef bool (mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* a /// @returns \a true if all areas and blocks were visited. bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg); +/// @brief Visit all areas and blocks in abandoned heaps. +/// @param subproc_id The sub-process id associated with the abandoned heaps. +/// @param heap_tag Visit only abandoned memory with the specified heap tag, use -1 to visit all abandoned memory. 
+/// @param visit_blocks If \a true visits all allocated blocks, otherwise +/// \a visitor is only called for every heap area. +/// @param visitor This function is called for every area in the heap +/// (with \a block as \a NULL). If \a visit_all_blocks is +/// \a true, \a visitor is also called for every allocated +/// block in every area (with `block!=NULL`). +/// return \a false from this function to stop visiting early. +/// @param arg extra argument passed to the \a visitor. +/// @return \a true if all areas and blocks were visited. +/// +/// Note: requires the option `mi_option_visit_abandoned` to be set +/// at the start of the program. +bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); + /// \} /// \defgroup options Runtime Options @@ -799,34 +910,38 @@ bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block /// Runtime options. typedef enum mi_option_e { // stable options - mi_option_show_errors, ///< Print error messages to `stderr`. - mi_option_show_stats, ///< Print statistics to `stderr` when the program is done. - mi_option_verbose, ///< Print verbose messages to `stderr`. - - // the following options are experimental - mi_option_eager_commit, ///< Eagerly commit segments (4MiB) (enabled by default). - mi_option_large_os_pages, ///< Use large OS pages (2MiB in size) if possible - mi_option_reserve_huge_os_pages, ///< The number of huge OS pages (1GiB in size) to reserve at the start of the program. - mi_option_reserve_huge_os_pages_at, ///< Reserve huge OS pages at node N. - mi_option_reserve_os_memory, ///< Reserve specified amount of OS memory at startup, e.g. "1g" or "512m". - mi_option_segment_cache, ///< The number of segments per thread to keep cached (0). - mi_option_page_reset, ///< Reset page memory after \a mi_option_reset_delay milliseconds when it becomes free. 
- mi_option_abandoned_page_reset, //< Reset free page memory when a thread terminates. - mi_option_use_numa_nodes, ///< Pretend there are at most N NUMA nodes; Use 0 to use the actual detected NUMA nodes at runtime. - mi_option_eager_commit_delay, ///< the first N segments per thread are not eagerly committed (=1). - mi_option_os_tag, ///< OS tag to assign to mimalloc'd memory - mi_option_limit_os_alloc, ///< If set to 1, do not use OS memory for allocation (but only pre-reserved arenas) - - // v1.x specific options - mi_option_eager_region_commit, ///< Eagerly commit large (256MiB) memory regions (enabled by default, except on Windows) - mi_option_segment_reset, ///< Experimental - mi_option_reset_delay, ///< Delay in milli-seconds before resetting a page (100ms by default) - mi_option_purge_decommits, ///< Experimental - - // v2.x specific options - mi_option_allow_purge, ///< Enable decommitting memory (=on) - mi_option_purge_delay, ///< Decommit page memory after N milli-seconds delay (25ms). - mi_option_segment_purge_delay, ///< Decommit large segment memory after N milli-seconds delay (500ms). + mi_option_show_errors, ///< Print error messages. + mi_option_show_stats, ///< Print statistics on termination. + mi_option_verbose, ///< Print verbose messages. + mi_option_max_errors, ///< issue at most N error messages + mi_option_max_warnings, ///< issue at most N warning messages + + // advanced options + mi_option_reserve_huge_os_pages, ///< reserve N huge OS pages (1GiB pages) at startup + mi_option_reserve_huge_os_pages_at, ///< Reserve N huge OS pages at a specific NUMA node N. + mi_option_reserve_os_memory, ///< reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`) + mi_option_allow_large_os_pages, ///< allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process. + mi_option_purge_decommits, ///< should a memory purge decommit? (=1). 
Set to 0 to use memory reset on a purge (instead of decommit) + mi_option_arena_reserve, ///< initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) + mi_option_os_tag, ///< tag used for OS logging (macOS only for now) (=100) + mi_option_retry_on_oom, ///< retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. (only on windows) + + // experimental options + mi_option_eager_commit, ///< eager commit segments? (after `eager_commit_delay` segments) (enabled by default). + mi_option_eager_commit_delay, ///< the first N segments per thread are not eagerly committed (but per page in the segment on demand) + mi_option_arena_eager_commit, ///< eager commit arenas? Use 2 to enable just on overcommit systems (=2) + mi_option_abandoned_page_purge, ///< immediately purge delayed purges on thread termination + mi_option_purge_delay, ///< memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. (=10) + mi_option_use_numa_nodes, ///< 0 = use all available numa nodes, otherwise use at most N nodes. + mi_option_disallow_os_alloc, ///< 1 = do not use OS memory for allocation (but only programmatically reserved arenas) + mi_option_limit_os_alloc, ///< If set to 1, do not use OS memory for allocation (but only pre-reserved arenas) + mi_option_max_segment_reclaim, ///< max. 
percentage of the abandoned segments can be reclaimed per try (=10%) + mi_option_destroy_on_exit, ///< if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe + mi_option_arena_purge_mult, ///< multiplier for `purge_delay` for the purging delay for arenas (=10) + mi_option_abandoned_reclaim_on_free, ///< allow to reclaim an abandoned segment on a free (=1) + mi_option_purge_extend_delay, ///< extend purge delay on each subsequent delay (=1) + mi_option_disallow_arena_alloc, ///< 1 = do not use arena's for allocation (except if using specific arena id's) + mi_option_visit_abandoned, ///< allow visiting heap blocks from abandoned threads (=0) _mi_option_last } mi_option_t; @@ -838,7 +953,10 @@ void mi_option_disable(mi_option_t option); void mi_option_set_enabled(mi_option_t option, bool enable); void mi_option_set_enabled_default(mi_option_t option, bool enable); -long mi_option_get(mi_option_t option); +long mi_option_get(mi_option_t option); +long mi_option_get_clamp(mi_option_t option, long min, long max); +size_t mi_option_get_size(mi_option_t option); + void mi_option_set(mi_option_t option, long value); void mi_option_set_default(mi_option_t option, long value); @@ -852,21 +970,27 @@ void mi_option_set_default(mi_option_t option, long value); /// /// \{ +/// Just as `free` but also checks if the pointer `p` belongs to our heap. +void mi_cfree(void* p); +void* mi__expand(void* p, size_t newsize); + void* mi_recalloc(void* p, size_t count, size_t size); size_t mi_malloc_size(const void* p); +size_t mi_malloc_good_size(size_t size); size_t mi_malloc_usable_size(const void *p); -/// Just as `free` but also checks if the pointer `p` belongs to our heap. 
-void mi_cfree(void* p); - int mi_posix_memalign(void** p, size_t alignment, size_t size); int mi__posix_memalign(void** p, size_t alignment, size_t size); void* mi_memalign(size_t alignment, size_t size); void* mi_valloc(size_t size); - void* mi_pvalloc(size_t size); void* mi_aligned_alloc(size_t alignment, size_t size); +unsigned short* mi_wcsdup(const unsigned short* s); +unsigned char* mi_mbsdup(const unsigned char* s); +int mi_dupenv_s(char** buf, size_t* size, const char* name); +int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name); + /// Correspond s to [reallocarray](https://www.freebsd.org/cgi/man.cgi?query=reallocarray&sektion=3&manpath=freebsd-release-ports) /// in FreeBSD. void* mi_reallocarray(void* p, size_t count, size_t size); @@ -874,6 +998,9 @@ void* mi_reallocarray(void* p, size_t count, size_t size); /// Corresponds to [reallocarr](https://man.netbsd.org/reallocarr.3) in NetBSD. int mi_reallocarr(void* p, size_t count, size_t size); +void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment); +void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset); + void mi_free_size(void* p, size_t size); void mi_free_size_aligned(void* p, size_t size, size_t alignment); void mi_free_aligned(void* p, size_t alignment); @@ -998,7 +1125,7 @@ mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist with other allocators linked to the same program. If you use `cmake`, you can simply use: ``` -find_package(mimalloc 1.0 REQUIRED) +find_package(mimalloc 2.1 REQUIRED) ``` in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either: ``` @@ -1012,7 +1139,7 @@ to link with the static library. See `test\CMakeLists.txt` for an example. ### C++ For best performance in C++ programs, it is also recommended to override the -global `new` and `delete` operators. For convience, mimalloc provides +global `new` and `delete` operators. 
For convenience, mimalloc provides [`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` @@ -1071,38 +1198,64 @@ See \ref overrides for more info. /*! \page environment Environment Options -You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), -or via environment variables. +You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables: - `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates. - `MIMALLOC_VERBOSE=1`: show verbose messages. - `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages. -- `MIMALLOC_PAGE_RESET=0`: by default, mimalloc will reset (or purge) OS pages when not in use to signal to the OS - that the underlying physical memory can be reused. This can reduce memory fragmentation in long running (server) - programs. By setting it to `0` no such page resets will be done which can improve performance for programs that are not long - running. As an alternative, the `MIMALLOC_DECOMMIT_DELAY=` can be set higher (100ms by default) to make the page - reset occur less frequently instead of turning it off completely. -- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly - improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs - to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes + +Advanced options: + +- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc + allocates segments and pages. 
Set this to 2 (default) to + only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems + as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). + Note that eager commit only increases the commit but not the actual peak resident set + (rss) so it is generally ok to enable this. +- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge + OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which + can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when + a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher + value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. + Setting it to `-1` disables purging completely. +- `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, + `MADV_DONTNEED` (which decreases rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused + memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems). + Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual + address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). + +Further options for large workloads and services: + +- `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected + at runtime. Setting `N` to 1 may avoid problems in some virtual environments. 
Also, setting it to a lower number than + the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA + nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). +- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly + improve performance. When this option is disabled (default), it also disables transparent huge pages (THP) for the process + (on Linux and Android). On Linux the default setting is 2 -- this enables the use of large pages through THP only. + Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs + to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that - can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead when possible). -- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB _huge_ OS pages. This reserves the huge pages at + can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). +- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at startup and sometimes this can give a large (latency) performance improvement on big workloads. - Usually it is better to not use - `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving + Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. 
Just like large + OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). - Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). With huge OS pages, it may be beneficial to set the setting + Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). + With huge OS pages, it may be beneficial to set the setting `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB) of a thread to not allocate in the huge OS pages; this prevents threads that are short lived - and allocate just a little to take up space in the huge OS page area (which cannot be reset). -- `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N`: where N is the numa node. This reserves the huge pages at a specific numa node. - (`N` is -1 by default to reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)) + and allocate just a little to take up space in the huge OS page area (which cannot be purged as huge OS pages are pinned + to physical memory). + The huge pages are usually allocated evenly among NUMA nodes. + We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all + the huge pages at a specific numa node instead. Use caution when using `fork` in combination with either large or huge OS pages: on a fork, the OS uses copy-on-write for all pages in the original process including the huge OS pages. When any memory is now written in that area, the -OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in big increments. +OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in large increments. 
[linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5 [windows-huge]: https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows?view=sql-server-2017 @@ -1111,88 +1264,106 @@ OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the /*! \page overrides Overriding Malloc -Overriding the standard `malloc` can be done either _dynamically_ or _statically_. +Overriding the standard `malloc` (and `new`) can be done either _dynamically_ or _statically_. ## Dynamic override This is the recommended way to override the standard malloc interface. +### Dynamic Override on Linux, BSD -### Linux, BSD - -On these systems we preload the mimalloc shared +On these ELF-based systems we preload the mimalloc shared library so all calls to the standard `malloc` interface are resolved to the _mimalloc_ library. - -- `env LD_PRELOAD=/usr/lib/libmimalloc.so myprogram` +``` +> env LD_PRELOAD=/usr/lib/libmimalloc.so myprogram +``` You can set extra environment variables to check that mimalloc is running, like: ``` -env MIMALLOC_VERBOSE=1 LD_PRELOAD=/usr/lib/libmimalloc.so myprogram +> env MIMALLOC_VERBOSE=1 LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` or run with the debug version to get detailed statistics: ``` -env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram +> env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram ``` -### MacOS +### Dynamic Override on MacOS On macOS we can also preload the mimalloc shared library so all calls to the standard `malloc` interface are resolved to the _mimalloc_ library. 
- -- `env DYLD_FORCE_FLAT_NAMESPACE=1 DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram` +``` +> env DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram +``` Note that certain security restrictions may apply when doing this from the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash). -(Note: macOS support for dynamic overriding is recent, please report any issues.) +### Dynamic Override on Windows + +Dynamically overriding on mimalloc on Windows +is robust and has the particular advantage to be able to redirect all malloc/free calls +that go through the (dynamic) C runtime allocator, including those from other DLL's or +libraries. As it intercepts all allocation calls on a low level, it can be used reliably +on large programs that include other 3rd party components. +There are four requirements to make the overriding work well: + +1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). -### Windows +2. Link your program explicitly with the `mimalloc.lib` export library for the `mimalloc.dll`. + (which must be compiled with `-DMI_OVERRIDE=ON`, which is the default though). + To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest + to insert some call to the mimalloc API in the `main` function, like `mi_version()` + (or use the `/include:mi_version` switch on the linker command, or + similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). + See the `mimalloc-test-override` project for an example on how to use this. -Overriding on Windows is robust and has the -particular advantage to be able to redirect all malloc/free calls that go through -the (dynamic) C runtime allocator, including those from other DLL's or libraries. +3. The `mimalloc-redirect.dll` must be put in the same directory as the main + `mimalloc.dll` at runtime (as it is a dependency of that DLL). 
+ The redirection DLL ensures that all calls to the C runtime malloc API get + redirected to mimalloc functions (which reside in `mimalloc.dll`). -The overriding on Windows requires that you link your program explicitly with -the mimalloc DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). -Also, the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be available -in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency). -The redirection DLL ensures that all calls to the C runtime malloc API get redirected to -mimalloc (in `mimalloc-override.dll`). +4. Ensure the `mimalloc.dll` comes as early as possible in the import + list of the final executable (so it can intercept all potential allocations). + You can use `minject -l ` to check this if needed. -To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some -call to the mimalloc API in the `main` function, like `mi_version()` -(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project -for an example on how to use this. For best performance on Windows with C++, it +For best performance on Windows with C++, it is also recommended to also override the `new`/`delete` operations (by including -[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) a single(!) source file in your project). +[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) +a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic -overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. +overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully +redirected. 
-(Note: in principle, it is possible to even patch existing executables without any recompilation -if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll` -into the import table (and put `mimalloc-redirect.dll` in the same folder) -Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)). +For different platforms than x64, you may need a specific [redirection dll](bin). +Furthermore, we cannot always re-link an executable or ensure `mimalloc.dll` comes +first in the import table. In such cases the [`minject`](bin) tool can be used +to patch the executable's import tables. ## Static override -On Unix systems, you can also statically link with _mimalloc_ to override the standard +On Unix-like systems, you can also statically link with _mimalloc_ to override the standard malloc interface. The recommended way is to link the final program with the -_mimalloc_ single object file (`mimalloc-override.o`). We use +_mimalloc_ single object file (`mimalloc.o`). We use an object file instead of a library file as linkers give preference to that over archives to resolve symbols. To ensure that the standard malloc interface resolves to the _mimalloc_ library, link it as the first object file. For example: - ``` -gcc -o myprogram mimalloc-override.o myfile1.c ... +> gcc -o myprogram mimalloc.o myfile1.c ... ``` +Another way to override statically that works on all platforms, is to +link statically to mimalloc (as shown in the introduction) and include a +header file in each source file that re-defines `malloc` etc. to `mi_malloc`. +This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably though if all sources are +under your control or otherwise mixing of pointers from different heaps may occur! 
+ ## List of Overrides: The specific functions that get redirected to the _mimalloc_ library are: diff --git a/third-party/mimalloc/doc/mimalloc-doxygen.css b/third-party/mimalloc/doc/mimalloc-doxygen.css index b24f564326..c889a8d2c3 100644 --- a/third-party/mimalloc/doc/mimalloc-doxygen.css +++ b/third-party/mimalloc/doc/mimalloc-doxygen.css @@ -47,3 +47,14 @@ div.fragment { #nav-sync img { display: none; } +h1,h2,h3,h4,h5,h6 { + transition:none; +} +.memtitle { + background-image: none; + background-color: #EEE; +} +table.memproto, .memproto { + text-shadow: none; + font-size: 110%; +} diff --git a/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile b/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile index 56f071db3c..f74934fb3f 100644 --- a/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile +++ b/third-party/mimalloc/docker/alpine-arm32v7/Dockerfile @@ -1,10 +1,10 @@ # install from an image -# download first an appropiate tar.gz image into the current directory +# download first an appropriate tar.gz image into the current directory # from: FROM scratch # Substitute the image name that was downloaded -ADD alpine-minirootfs-20240329-armv7.tar.gz / +ADD alpine-minirootfs-20240329-armv7.tar.gz / # Install tools RUN apk add build-base make cmake @@ -15,7 +15,7 @@ RUN mkdir -p /home/dev WORKDIR /home/dev # Get mimalloc -RUN git clone https://github.com/microsoft/mimalloc -b dev-slice +RUN git clone https://github.com/microsoft/mimalloc -b dev2 RUN mkdir -p mimalloc/out/release RUN mkdir -p mimalloc/out/debug diff --git a/third-party/mimalloc/docker/alpine/Dockerfile b/third-party/mimalloc/docker/alpine/Dockerfile index b222b79194..e1234a9b64 100644 --- a/third-party/mimalloc/docker/alpine/Dockerfile +++ b/third-party/mimalloc/docker/alpine/Dockerfile @@ -10,7 +10,7 @@ RUN mkdir -p /home/dev WORKDIR /home/dev # Get mimalloc -RUN git clone https://github.com/microsoft/mimalloc -b dev-slice +RUN git clone https://github.com/microsoft/mimalloc -b dev2 RUN mkdir 
-p mimalloc/out/release RUN mkdir -p mimalloc/out/debug diff --git a/third-party/mimalloc/docker/manylinux-x64/Dockerfile b/third-party/mimalloc/docker/manylinux-x64/Dockerfile index 22d37e5a72..ff54d674ef 100644 --- a/third-party/mimalloc/docker/manylinux-x64/Dockerfile +++ b/third-party/mimalloc/docker/manylinux-x64/Dockerfile @@ -10,7 +10,7 @@ RUN mkdir -p /home/dev WORKDIR /home/dev # Get mimalloc -RUN git clone https://github.com/microsoft/mimalloc -b dev-slice +RUN git clone https://github.com/microsoft/mimalloc -b dev2 RUN mkdir -p mimalloc/out/release RUN mkdir -p mimalloc/out/debug diff --git a/third-party/mimalloc/ide/vs2017/mimalloc-override-test.vcxproj b/third-party/mimalloc/ide/vs2017/mimalloc-override-test.vcxproj deleted file mode 100644 index 04c16a9faf..0000000000 --- a/third-party/mimalloc/ide/vs2017/mimalloc-override-test.vcxproj +++ /dev/null @@ -1,190 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {FEF7868F-750E-4C21-A04D-22707CC66879} - mimalloc-override-test - mimalloc-override-test - 10.0.19041.0 - - - - Application - true - v141 - - - Application - false - v141 - true - - - Application - true - v141 - - - Application - false - v141 - true - - - - - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - true - true - ..\..\include - MultiThreadedDebugDLL - false - Default - false - - - Console - kernel32.lib;%(AdditionalDependencies) - - 
- - - - - - - - Level3 - Disabled - true - true - ..\..\include - MultiThreadedDebugDLL - Sync - Default - false - - - Console - - - kernel32.lib;%(AdditionalDependencies) - - - - - - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - MultiThreadedDLL - - - true - true - Console - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - MultiThreadedDLL - - - true - true - Console - - - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - - - - - {abb5eae7-b3e6-432e-b636-333449892ea7} - - - - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2017/mimalloc-override.vcxproj b/third-party/mimalloc/ide/vs2017/mimalloc-override.vcxproj deleted file mode 100644 index 6d20eb578f..0000000000 --- a/third-party/mimalloc/ide/vs2017/mimalloc-override.vcxproj +++ /dev/null @@ -1,260 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {ABB5EAE7-B3E6-432E-B636-333449892EA7} - mimalloc-override - mimalloc-override - 10.0.19041.0 - - - - DynamicLibrary - true - v141 - - - DynamicLibrary - false - v141 - - - DynamicLibrary - true - v141 - - - DynamicLibrary - false - v141 - - - - - - - - - - - - - - - - - - - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - 
$(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - - Level3 - Disabled - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); - MultiThreadedDebugDLL - false - Default - - - $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) - - - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect32.dll" "$(OutputPath)" - - - Copy mimalloc-redirect32.dll to the output directory - - - - - Level3 - Disabled - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); - MultiThreadedDebugDLL - false - Default - - - $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;bcrypt.lib;%(AdditionalDependencies) - - - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect.dll" "$(OutputPath)" - - - copy mimalloc-redirect.dll to the output directory - - - - - Level3 - MaxSpeed - true - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - MultiThreadedDLL - Default - false - - - true - true - $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect32.dll" "$(OutputPath)" - - - Copy mimalloc-redirect32.dll to the output directory - - - - - Level3 - MaxSpeed - true - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - MultiThreadedDLL - 
Default - false - - - true - true - $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;bcrypt.lib;%(AdditionalDependencies) - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect.dll" "$(OutputPath)" - - - copy mimalloc-redirect.dll to the output directory - - - - - - - - - - - - - - - - false - false - false - false - - - true - true - true - true - - - - - - - - - - - - - true - true - true - true - - - - - - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2017/mimalloc-test-stress.vcxproj b/third-party/mimalloc/ide/vs2017/mimalloc-test-stress.vcxproj deleted file mode 100644 index 061b8605c8..0000000000 --- a/third-party/mimalloc/ide/vs2017/mimalloc-test-stress.vcxproj +++ /dev/null @@ -1,159 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {FEF7958F-750E-4C21-A04D-22707CC66878} - mimalloc-test-stress - mimalloc-test-stress - 10.0.19041.0 - - - - Application - true - v141 - - - Application - false - v141 - true - - - Application - true - v141 - - - Application - false - v141 - true - - - - - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - true - true - ..\..\include - - - Console - - - - - Level3 - Disabled - true - true - ..\..\include - - - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - %(PreprocessorDefinitions);NDEBUG - - - true - true - Console - - - - - Level3 - 
MaxSpeed - true - true - true - true - ..\..\include - %(PreprocessorDefinitions);NDEBUG - - - true - true - Console - - - - - false - false - false - false - - - - - {abb5eae7-b3e6-432e-b636-333449892ea6} - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2017/mimalloc-test.vcxproj b/third-party/mimalloc/ide/vs2017/mimalloc-test.vcxproj deleted file mode 100644 index 04bd6537b4..0000000000 --- a/third-party/mimalloc/ide/vs2017/mimalloc-test.vcxproj +++ /dev/null @@ -1,158 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {FEF7858F-750E-4C21-A04D-22707CC66878} - mimalloctest - mimalloc-test - 10.0.19041.0 - - - - Application - true - v141 - - - Application - false - v141 - true - - - Application - true - v141 - - - Application - false - v141 - true - - - - - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - true - true - ..\..\include - stdcpp17 - - - Console - - - - - Level3 - Disabled - true - true - ..\..\include - stdcpp14 - - - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - stdcpp17 - - - true - true - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - stdcpp17 - - - true - true - Console - - - - - {abb5eae7-b3e6-432e-b636-333449892ea6} - - - - - - - - - \ No newline at end of 
file diff --git a/third-party/mimalloc/ide/vs2017/mimalloc.sln b/third-party/mimalloc/ide/vs2017/mimalloc.sln deleted file mode 100644 index 515c03f2e7..0000000000 --- a/third-party/mimalloc/ide/vs2017/mimalloc.sln +++ /dev/null @@ -1,71 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.26228.102 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc", "mimalloc.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA6}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test", "mimalloc-test.vcxproj", "{FEF7858F-750E-4C21-A04D-22707CC66878}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override", "mimalloc-override.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA7}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override-test", "mimalloc-override-test.vcxproj", "{FEF7868F-750E-4C21-A04D-22707CC66879}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-stress", "mimalloc-test-stress.vcxproj", "{FEF7958F-750E-4C21-A04D-22707CC66878}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 - Release|x64 = Release|x64 - Release|x86 = Release|x86 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.ActiveCfg = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.Build.0 = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.ActiveCfg = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.Build.0 = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.ActiveCfg = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.Build.0 = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.ActiveCfg = Release|Win32 - 
{ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.Build.0 = Release|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.ActiveCfg = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.Build.0 = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.ActiveCfg = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.Build.0 = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.ActiveCfg = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.Build.0 = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.ActiveCfg = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.Build.0 = Release|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.Build.0 = Debug|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.Build.0 = Debug|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.ActiveCfg = Release|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.Build.0 = Release|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.ActiveCfg = Release|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.Build.0 = Release|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - 
{FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {4297F93D-486A-4243-995F-7D32F59AE82A} - EndGlobalSection -EndGlobal diff --git a/third-party/mimalloc/ide/vs2017/mimalloc.vcxproj b/third-party/mimalloc/ide/vs2017/mimalloc.vcxproj deleted file mode 100644 index ece9a14d75..0000000000 --- a/third-party/mimalloc/ide/vs2017/mimalloc.vcxproj +++ /dev/null @@ -1,260 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {ABB5EAE7-B3E6-432E-B636-333449892EA6} - mimalloc - 10.0.19041.0 - mimalloc - - - - StaticLibrary - true - v141 - - - StaticLibrary - false - v141 - true - - - StaticLibrary - true - v141 - - - StaticLibrary - false - v141 - true - - - - - - - - - - - - - - - - - - - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - 
$(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - false - - - false - - - false - - - false - - - - Level3 - Disabled - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;MI_DEBUG=3;%(PreprocessorDefinitions); - CompileAsC - false - stdcpp17 - - - - - - - - - - - Level4 - Disabled - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;MI_DEBUG=3;%(PreprocessorDefinitions); - CompileAsCpp - false - stdcpp14 - - - - - - - - - - - - - - - - - - - Level3 - MaxSpeed - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - false - Default - CompileAsC - true - - - true - true - - - - - - - - - - - Level4 - MaxSpeed - true - true - ../../include - _CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - false - Default - CompileAsC - true - - - true - true - - - - - - - - - - - - - - - - - false - false - false - false - - - true - true - true - true - - - - - - - - - - - - true - true - true - true - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2019/mimalloc-override-test.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc-override-test.vcxproj deleted file mode 100644 index 7a9202f1b1..0000000000 --- a/third-party/mimalloc/ide/vs2019/mimalloc-override-test.vcxproj +++ /dev/null @@ -1,190 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {FEF7868F-750E-4C21-A04D-22707CC66879} - mimalloc-override-test - 10.0 - mimalloc-override-test - - - - Application - true - v142 - - - Application - false - v142 - true - - - Application - true - v142 - - - Application - false - v142 - true - - - - - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - 
$(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - true - true - ..\..\include - MultiThreadedDebugDLL - Sync - Default - false - - - Console - kernel32.lib;%(AdditionalDependencies) - - - - - - - - - - Level3 - Disabled - true - true - ..\..\include - MultiThreadedDebugDLL - Sync - Default - false - - - Console - - - kernel32.lib;%(AdditionalDependencies) - - - - - - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - MultiThreadedDLL - - - true - true - Console - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - MultiThreadedDLL - - - true - true - Console - - - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - - - - - - - - - - - - {abb5eae7-b3e6-432e-b636-333449892ea7} - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2019/mimalloc-override.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc-override.vcxproj deleted file mode 100644 index a84a517858..0000000000 --- a/third-party/mimalloc/ide/vs2019/mimalloc-override.vcxproj +++ /dev/null @@ -1,260 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {ABB5EAE7-B3E6-432E-B636-333449892EA7} - mimalloc-override - 10.0 - mimalloc-override - 
- - - DynamicLibrary - true - v142 - - - DynamicLibrary - false - v142 - - - DynamicLibrary - true - v142 - - - DynamicLibrary - false - v142 - - - - - - - - - - - - - - - - - - - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .dll - mimalloc-override - - - - Level3 - Disabled - true - true - ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); - MultiThreadedDebugDLL - false - Default - - - $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) - - - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect32.dll" "$(OutputPath)" - - - Copy mimalloc-redirect32.dll to the output directory - - - - - Level3 - Disabled - true - true - ../../include - MI_DEBUG=3;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); - MultiThreadedDebugDLL - false - Default - - - $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) - - - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect.dll" "$(OutputPath)" - - - copy mimalloc-redirect.dll to the output directory - - - - - Level3 - MaxSpeed - true - true - true - ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - MultiThreadedDLL - Default - false - - - true - true - 
$(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect32.dll" "$(OutputPath)" - - - Copy mimalloc-redirect32.dll to the output directory - - - - - Level3 - MaxSpeed - true - true - true - ../../include - MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - MultiThreadedDLL - Default - false - - - true - true - $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) - - - Default - false - - - COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect.dll" "$(OutputPath)" - - - copy mimalloc-redirect.dll to the output directory - - - - - - - - - - - - - - - - false - false - false - false - - - true - true - true - true - - - - - - - - - - - - - true - true - true - true - - - - - - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2019/mimalloc-test-api.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc-test-api.vcxproj deleted file mode 100644 index 812a9cb116..0000000000 --- a/third-party/mimalloc/ide/vs2019/mimalloc-test-api.vcxproj +++ /dev/null @@ -1,155 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {FFF7958F-750E-4C21-A04D-22707CC66878} - mimalloc-test-api - 10.0 - mimalloc-test-api - - - - Application - true - v142 - - - Application - false - v142 - true - - - Application - true - v142 - - - Application - false - v142 - true - - - - - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - 
$(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - true - true - ..\..\include - - - Console - - - - - Level3 - Disabled - true - true - ..\..\include - - - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - %(PreprocessorDefinitions);NDEBUG - - - true - true - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - %(PreprocessorDefinitions);NDEBUG - - - true - true - Console - - - - - - - - - {abb5eae7-b3e6-432e-b636-333449892ea6} - - - - - - diff --git a/third-party/mimalloc/ide/vs2019/mimalloc-test-stress.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc-test-stress.vcxproj deleted file mode 100644 index ef7ab3575a..0000000000 --- a/third-party/mimalloc/ide/vs2019/mimalloc-test-stress.vcxproj +++ /dev/null @@ -1,159 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {FEF7958F-750E-4C21-A04D-22707CC66878} - mimalloc-test-stress - 10.0 - mimalloc-test-stress - - - - Application - true - v142 - - - Application - false - v142 - true - - - Application - true - v142 - - - Application - false - v142 - true - - - - - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - true - true - ..\..\include - - - Console - - - - - Level3 - Disabled - true - true - ..\..\include - - - Console - - - - - Level3 - 
MaxSpeed - true - true - true - true - ..\..\include - %(PreprocessorDefinitions);NDEBUG - - - true - true - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - %(PreprocessorDefinitions);NDEBUG - - - true - true - Console - - - - - false - false - false - false - - - - - {abb5eae7-b3e6-432e-b636-333449892ea6} - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2019/mimalloc-test.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc-test.vcxproj deleted file mode 100644 index 13af6ab495..0000000000 --- a/third-party/mimalloc/ide/vs2019/mimalloc-test.vcxproj +++ /dev/null @@ -1,158 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {FEF7858F-750E-4C21-A04D-22707CC66878} - mimalloctest - 10.0 - mimalloc-test - - - - Application - true - v142 - - - Application - false - v142 - true - - - Application - true - v142 - - - Application - false - v142 - true - - - - - - - - - - - - - - - - - - - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - - - - Level3 - Disabled - true - true - ..\..\include - stdcpp17 - - - Console - - - - - Level3 - Disabled - true - true - ..\..\include - stdcpp17 - - - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - stdcpp17 - - - true - true - Console - - - - - Level3 - MaxSpeed - true - true - true - true - ..\..\include - _MBCS;%(PreprocessorDefinitions);NDEBUG - 
stdcpp17 - - - true - true - Console - - - - - {abb5eae7-b3e6-432e-b636-333449892ea6} - - - - - - - - - \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2019/mimalloc.sln b/third-party/mimalloc/ide/vs2019/mimalloc.sln deleted file mode 100644 index 6ff01d3b44..0000000000 --- a/third-party/mimalloc/ide/vs2019/mimalloc.sln +++ /dev/null @@ -1,81 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.29709.97 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc", "mimalloc.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA6}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test", "mimalloc-test.vcxproj", "{FEF7858F-750E-4C21-A04D-22707CC66878}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override", "mimalloc-override.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA7}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override-test", "mimalloc-override-test.vcxproj", "{FEF7868F-750E-4C21-A04D-22707CC66879}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-stress", "mimalloc-test-stress.vcxproj", "{FEF7958F-750E-4C21-A04D-22707CC66878}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-api", "mimalloc-test-api.vcxproj", "{FFF7958F-750E-4C21-A04D-22707CC66878}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 - Release|x64 = Release|x64 - Release|x86 = Release|x86 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.ActiveCfg = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.Build.0 = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.ActiveCfg = Debug|Win32 - 
{ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.Build.0 = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.ActiveCfg = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.Build.0 = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.ActiveCfg = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.Build.0 = Release|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.ActiveCfg = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.Build.0 = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.ActiveCfg = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.Build.0 = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.ActiveCfg = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.Build.0 = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.ActiveCfg = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.Build.0 = Release|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.Build.0 = Debug|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.Build.0 = Debug|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.ActiveCfg = Release|x64 - 
{FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.Build.0 = Release|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.ActiveCfg = Release|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.Build.0 = Release|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {4297F93D-486A-4243-995F-7D32F59AE82A} - EndGlobalSection -EndGlobal diff --git a/third-party/mimalloc/ide/vs2019/mimalloc.vcxproj b/third-party/mimalloc/ide/vs2019/mimalloc.vcxproj deleted file mode 100644 index 0076b1dbdd..0000000000 --- a/third-party/mimalloc/ide/vs2019/mimalloc.vcxproj +++ /dev/null @@ -1,258 +0,0 @@ - - - - - Debug - Win32 - - - Release - Win32 - - - Debug - 
x64 - - - Release - x64 - - - - 15.0 - {ABB5EAE7-B3E6-432E-B636-333449892EA6} - mimalloc - 10.0 - mimalloc - - - - StaticLibrary - true - v142 - - - StaticLibrary - false - v142 - true - - - StaticLibrary - true - v142 - - - StaticLibrary - false - v142 - true - - - - - - - - - - - - - - - - - - - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ - $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - .lib - mimalloc-static - - - - Level4 - Disabled - true - true - ../../include - MI_DEBUG=3;%(PreprocessorDefinitions); - CompileAsCpp - false - Default - - - - - - - - - - - Level4 - Disabled - true - Default - ../../include - MI_DEBUG=3;%(PreprocessorDefinitions); - CompileAsCpp - false - Default - - - - - - - - - - - - - - - - - - - Level4 - MaxSpeed - true - true - ../../include - %(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - false - Default - CompileAsCpp - true - Default - - - true - true - - - - - - - - - - - Level4 - MaxSpeed - true - true - ../../include - %(PreprocessorDefinitions);NDEBUG - AssemblyAndSourceCode - $(IntDir) - false - false - Default - CompileAsCpp - true - Default - - - true - true - - - - - - - - - - - - - - - - - false - false - false - false - - - true - true - true - true - - - - - - false - - - - - - - true - true - true - true - - - - true - true - true - true - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git 
a/third-party/mimalloc/ide/vs2022/mimalloc.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj similarity index 51% rename from third-party/mimalloc/ide/vs2022/mimalloc.vcxproj rename to third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj index 33ad9cef13..9009b99f1a 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj @@ -1,10 +1,26 @@  + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM64 + + + Release + ARM64EC + Release Win32 @@ -21,9 +37,9 @@ 15.0 {ABB5EAE7-B3E6-432E-B636-333449892EA6} - mimalloc + mimalloc-lib 10.0 - mimalloc + mimalloc-lib @@ -42,12 +58,34 @@ true v143 + + StaticLibrary + true + v143 + + + StaticLibrary + true + v143 + StaticLibrary false v143 true + + StaticLibrary + false + v143 + true + + + StaticLibrary + false + v143 + true + @@ -62,9 +100,21 @@ + + + + + + + + + + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ @@ -84,12 +134,36 @@ .lib mimalloc-static + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .lib + mimalloc-static + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .lib + mimalloc-static + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc-static + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .lib + mimalloc-static + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .lib + mimalloc-static + Level4 @@ -116,7 +190,61 @@ true Default ../../include - MI_DEBUG=4;MI_SECURE=0;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + 
CompileAsCpp + false + stdcpp20 + + + + + + + + + + + + + + + + + + + Level4 + Disabled + true + Default + ../../include + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + CompileAsCpp + false + stdcpp20 + + + + + + + + + + + + + + + + + + + Level4 + Disabled + true + Default + ../../include + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 @@ -198,30 +326,124 @@ + + + Level4 + MaxSpeed + true + Default + ../../include + %(PreprocessorDefinitions);NDEBUG + AssemblyAndSourceCode + $(IntDir) + false + false + Default + CompileAsCpp + true + stdcpp20 + CPUExtensionRequirementsARMv81 + Sync + + + true + true + + + + + + + + + + + + + + + + + Level4 + MaxSpeed + true + Default + ../../include + %(PreprocessorDefinitions);NDEBUG + AssemblyAndSourceCode + $(IntDir) + false + false + Default + CompileAsCpp + true + stdcpp20 + CPUExtensionRequirementsARMv81 + Sync + + + true + true + + + + + + + + + + + + + + false false + false + false false false + false + false true true true + true + true true + true + true + + true + true + true + true + true + true + false + false + false true + true + true true true true + true + true @@ -231,14 +453,22 @@ true true true + true + true true + true + true true true true + true + true true + true + true diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj.filters b/third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj.filters new file mode 100644 index 0000000000..90703da888 --- /dev/null +++ b/third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj.filters @@ -0,0 +1,105 @@ + + + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + 
Headers + + + + + {1430490c-e711-4ace-a1b8-36f4d5105873} + + + {461c78ef-04b0-44d1-a0ca-7d488abaa592} + + + \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-override.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj similarity index 51% rename from third-party/mimalloc/ide/vs2022/mimalloc-override.vcxproj rename to third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj index df2a081690..c1849bb261 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc-override.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj @@ -1,10 +1,26 @@  + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM64 + + + Release + ARM64EC + Release Win32 @@ -21,9 +37,9 @@ 15.0 {ABB5EAE7-B3E6-432E-B636-333449892EA7} - mimalloc-override + mimalloc-override-dll 10.0 - mimalloc-override + mimalloc-override-dll @@ -41,11 +57,31 @@ true v143 + + DynamicLibrary + true + v143 + + + DynamicLibrary + true + v143 + DynamicLibrary false v143 + + DynamicLibrary + false + v143 + + + DynamicLibrary + false + v143 + @@ -60,33 +96,69 @@ + + + + + + + + + + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc-override + mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc-override + mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc-override + mimalloc + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .dll + mimalloc + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .dll + mimalloc 
$(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc-override + mimalloc + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .dll + mimalloc + + + $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + .dll + mimalloc @@ -98,7 +170,7 @@ MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false - Default + CompileAsCpp $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) @@ -126,7 +198,7 @@ MI_DEBUG=4;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false - Default + CompileAsCpp $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) @@ -144,6 +216,62 @@ copy mimalloc-redirect.dll to the output directory + + + Level3 + Disabled + true + true + ../../include + MI_DEBUG=4;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); + MultiThreadedDebugDLL + false + CompileAsCpp + + + $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64.lib;%(AdditionalDependencies) + + + + + Default + false + + + COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64.dll" "$(OutputPath)" + + + copy mimalloc-redirect-arm64.dll to the output directory + + + + + Level3 + Disabled + true + true + ../../include + MI_DEBUG=4;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); + MultiThreadedDebugDLL + false + CompileAsCpp + + + $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64ec.lib;%(AdditionalDependencies) + + + + + Default + false + + + COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64ec.dll" "$(OutputPath)" + + + copy mimalloc-redirect-arm64ec.dll to the output directory + + Level3 @@ -157,7 +285,7 @@ $(IntDir) false 
MultiThreadedDLL - Default + CompileAsCpp false @@ -189,7 +317,7 @@ $(IntDir) false MultiThreadedDLL - Default + CompileAsCpp false @@ -208,6 +336,72 @@ copy mimalloc-redirect.dll to the output directory + + + Level3 + MaxSpeed + true + true + true + ../../include + MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG + AssemblyAndSourceCode + $(IntDir) + false + MultiThreadedDLL + CompileAsCpp + false + CPUExtensionRequirementsARMv81 + + + true + true + $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64.lib;%(AdditionalDependencies) + + + Default + false + + + COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64.dll" "$(OutputPath)" + + + copy mimalloc-redirect-arm64.dll to the output directory + + + + + Level3 + MaxSpeed + true + true + true + ../../include + MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG + AssemblyAndSourceCode + $(IntDir) + false + MultiThreadedDLL + CompileAsCpp + false + CPUExtensionRequirementsARMv81 + + + true + true + $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64ec.lib;%(AdditionalDependencies) + + + Default + false + + + COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64ec.dll" "$(OutputPath)" + + + copy mimalloc-redirect-arm64ec.dll to the output directory + + @@ -226,16 +420,34 @@ false false false + false + false false + false + false true true true + true + true true + true + true + + true + true + true + true + true + true + true + true + @@ -246,7 +458,11 @@ true true true + true + true true + true + true @@ -254,7 +470,11 @@ true true true + true + true true + true + true diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj.filters b/third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj.filters new file mode 100644 index 0000000000..91bdf95c73 --- /dev/null +++ b/third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj.filters @@ -0,0 +1,113 @@ + + + + + Sources + + + Sources + + + Sources + + + Sources + + + 
Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + Sources + + + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + Headers + + + + + {262c6c21-e270-4ba6-bd63-4ac999307e4e} + + + {94b40bdc-a741-45dd-81aa-c05fabcd2970} + + + + + Sources + + + \ No newline at end of file diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-override-test.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc-override-test.vcxproj index a3c56f7bad..427a75ae14 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc-override-test.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc-override-test.vcxproj @@ -1,10 +1,26 @@ + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM64 + + + Release + ARM64EC + Release Win32 @@ -23,7 +39,7 @@ {FEF7868F-750E-4C21-A04D-22707CC66879} mimalloc-override-test 10.0 - mimalloc-override-test + mimalloc-test-override @@ -42,12 +58,34 @@ true v143 + + Application + true + v143 + + + Application + true + v143 + Application false v143 true + + Application + false + v143 + true + + + Application + false + v143 + true + @@ -62,9 +100,21 @@ + + + + + + + + + + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ @@ -78,10 +128,26 @@ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + 
$(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + Level3 @@ -128,6 +194,54 @@ + + + Level3 + Disabled + true + true + ..\..\include + MultiThreadedDebugDLL + Sync + Default + false + + + Console + + + kernel32.lib;%(AdditionalDependencies) + + + + + + + + + + Level3 + Disabled + true + true + ..\..\include + MultiThreadedDebugDLL + Sync + Default + false + + + Console + + + kernel32.lib;%(AdditionalDependencies) + + + + + + + Level3 @@ -176,11 +290,61 @@ + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + _MBCS;%(PreprocessorDefinitions);NDEBUG + MultiThreadedDLL + + + true + true + Console + + + kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + + + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + _MBCS;%(PreprocessorDefinitions);NDEBUG + MultiThreadedDLL + + + true + true + Console + + + kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + + - + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-test-api.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc-test-api.vcxproj index d9b9cae4f1..b7f97ad204 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc-test-api.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc-test-api.vcxproj @@ -1,10 +1,26 @@ + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM64 + + + Release + ARM64EC + Release Win32 @@ -42,12 +58,34 @@ true v143 + + Application + true + v143 + + + Application + true + v143 + Application false v143 true + + Application + false + v143 + true + 
+ + Application + false + v143 + true + @@ -62,9 +100,21 @@ + + + + + + + + + + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ @@ -78,10 +128,26 @@ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + Level3 @@ -106,6 +172,30 @@ Console + + + Level3 + Disabled + true + true + ..\..\include + + + Console + + + + + Level3 + Disabled + true + true + ..\..\include + + + Console + + Level3 @@ -140,19 +230,59 @@ Console + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + %(PreprocessorDefinitions);NDEBUG + + + true + true + Console + + + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + %(PreprocessorDefinitions);NDEBUG + + + true + true + Console + + true true true + true + true true + true + true false + false + false - + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-test-stress.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc-test-stress.vcxproj index 14bd3e6927..cb761f9429 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc-test-stress.vcxproj @@ -1,10 +1,26 @@ + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM64 + + + Release + 
ARM64EC + Release Win32 @@ -42,12 +58,34 @@ true v143 + + Application + true + v143 + + + Application + true + v143 + Application false v143 true + + Application + false + v143 + true + + + Application + false + v143 + true + @@ -62,9 +100,21 @@ + + + + + + + + + + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ @@ -78,10 +128,26 @@ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + Level3 @@ -106,6 +172,30 @@ Console + + + Level3 + Disabled + true + true + ..\..\include + + + Console + + + + + Level3 + Disabled + true + true + ..\..\include + + + Console + + Level3 @@ -140,17 +230,57 @@ Console + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + %(PreprocessorDefinitions);NDEBUG + CPUExtensionRequirementsARMv81 + + + true + true + Console + + + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + %(PreprocessorDefinitions);NDEBUG + CPUExtensionRequirementsARMv81 + + + true + true + Console + + false false + false + false false false + false + false - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/third-party/mimalloc/ide/vs2022/mimalloc-test.vcxproj b/third-party/mimalloc/ide/vs2022/mimalloc-test.vcxproj index 
506dd7d457..83202dbed6 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc-test.vcxproj +++ b/third-party/mimalloc/ide/vs2022/mimalloc-test.vcxproj @@ -1,10 +1,26 @@ + + Debug + ARM64 + + + Debug + ARM64EC + Debug Win32 + + Release + ARM64 + + + Release + ARM64EC + Release Win32 @@ -23,7 +39,7 @@ {FEF7858F-750E-4C21-A04D-22707CC66878} mimalloctest 10.0 - mimalloc-test + mimalloc-test-static @@ -42,12 +58,34 @@ true v143 + + Application + true + v143 + + + Application + true + v143 + Application false v143 true + + Application + false + v143 + true + + + Application + false + v143 + true + @@ -62,9 +100,21 @@ + + + + + + + + + + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ @@ -78,10 +128,26 @@ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + + + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + Level3 @@ -108,6 +174,32 @@ Console + + + Level3 + Disabled + true + true + ..\..\include + stdcpp17 + + + Console + + + + + Level3 + Disabled + true + true + ..\..\include + stdcpp17 + + + Console + + Level3 @@ -144,13 +236,49 @@ Console + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include + _MBCS;%(PreprocessorDefinitions);NDEBUG + stdcpp17 + + + true + true + Console + + + + + Level3 + MaxSpeed + true + true + true + true + ..\..\include 
+ _MBCS;%(PreprocessorDefinitions);NDEBUG + stdcpp17 + + + true + true + Console + + - - {abb5eae7-b3e6-432e-b636-333449892ea6} - + - + + {abb5eae7-b3e6-432e-b636-333449892ea6} + diff --git a/third-party/mimalloc/ide/vs2022/mimalloc.sln b/third-party/mimalloc/ide/vs2022/mimalloc.sln index 6ff01d3b44..040af3aca4 100644 --- a/third-party/mimalloc/ide/vs2022/mimalloc.sln +++ b/third-party/mimalloc/ide/vs2022/mimalloc.sln @@ -1,81 +1,133 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.29709.97 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc", "mimalloc.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA6}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test", "mimalloc-test.vcxproj", "{FEF7858F-750E-4C21-A04D-22707CC66878}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override", "mimalloc-override.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA7}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override-test", "mimalloc-override-test.vcxproj", "{FEF7868F-750E-4C21-A04D-22707CC66879}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-stress", "mimalloc-test-stress.vcxproj", "{FEF7958F-750E-4C21-A04D-22707CC66878}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-api", "mimalloc-test-api.vcxproj", "{FFF7958F-750E-4C21-A04D-22707CC66878}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 - Release|x64 = Release|x64 - Release|x86 = Release|x86 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.ActiveCfg = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.Build.0 = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.ActiveCfg = 
Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.Build.0 = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.ActiveCfg = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.Build.0 = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.ActiveCfg = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.Build.0 = Release|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.ActiveCfg = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.Build.0 = Debug|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.ActiveCfg = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.Build.0 = Debug|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.ActiveCfg = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.Build.0 = Release|x64 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.ActiveCfg = Release|Win32 - {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.Build.0 = Release|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.Build.0 = Debug|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.Build.0 = Debug|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.ActiveCfg = Release|x64 - 
{FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.Build.0 = Release|x64 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.ActiveCfg = Release|Win32 - {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.Build.0 = Release|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 - {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {4297F93D-486A-4243-995F-7D32F59AE82A} - EndGlobalSection -EndGlobal + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.12.35527.113 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-lib", "mimalloc-lib.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA6}" 
+EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test", "mimalloc-test.vcxproj", "{FEF7858F-750E-4C21-A04D-22707CC66878}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override-dll", "mimalloc-override-dll.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA7}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override-test", "mimalloc-override-test.vcxproj", "{FEF7868F-750E-4C21-A04D-22707CC66879}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-stress", "mimalloc-test-stress.vcxproj", "{FEF7958F-750E-4C21-A04D-22707CC66878}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-api", "mimalloc-test-api.vcxproj", "{FFF7958F-750E-4C21-A04D-22707CC66878}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|ARM64 = Debug|ARM64 + Debug|ARM64EC = Debug|ARM64EC + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|ARM64 = Release|ARM64 + Release|ARM64EC = Release|ARM64EC + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64.Build.0 = Debug|ARM64 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64EC.Build.0 = Debug|ARM64EC + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.ActiveCfg = Debug|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.Build.0 = Debug|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.ActiveCfg = Debug|Win32 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.Build.0 = Debug|Win32 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64.ActiveCfg = Release|ARM64 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64.Build.0 = Release|ARM64 + 
{ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64EC.Build.0 = Release|ARM64EC + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.ActiveCfg = Release|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.Build.0 = Release|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.ActiveCfg = Release|Win32 + {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.Build.0 = Release|Win32 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.Build.0 = Debug|ARM64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.Build.0 = Debug|ARM64EC + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64.ActiveCfg = Release|ARM64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64.Build.0 = Release|ARM64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.Build.0 = Release|ARM64EC + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 + {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64.Build.0 = Debug|ARM64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + 
{ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64EC.Build.0 = Debug|ARM64EC + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.ActiveCfg = Debug|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.Build.0 = Debug|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.ActiveCfg = Debug|Win32 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.Build.0 = Debug|Win32 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64.ActiveCfg = Release|ARM64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64.Build.0 = Release|ARM64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64EC.Build.0 = Release|ARM64EC + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.ActiveCfg = Release|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.Build.0 = Release|x64 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.ActiveCfg = Release|Win32 + {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.Build.0 = Release|Win32 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64.Build.0 = Debug|ARM64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64EC.Build.0 = Debug|ARM64EC + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.ActiveCfg = Debug|x64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.Build.0 = Debug|x64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.ActiveCfg = Debug|Win32 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.Build.0 = Debug|Win32 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64.ActiveCfg = Release|ARM64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64.Build.0 = Release|ARM64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64EC.Build.0 = Release|ARM64EC + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.ActiveCfg 
= Release|x64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.Build.0 = Release|x64 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.ActiveCfg = Release|Win32 + {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.Build.0 = Release|Win32 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.Build.0 = Debug|ARM64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.Build.0 = Debug|ARM64EC + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.ActiveCfg = Release|ARM64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.Build.0 = Release|ARM64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.Build.0 = Release|ARM64EC + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 + {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.Build.0 = Debug|ARM64 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC + {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.Build.0 = Debug|ARM64EC + {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 + 
{FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.ActiveCfg = Release|ARM64 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.Build.0 = Release|ARM64 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.ActiveCfg = Release|ARM64EC + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.Build.0 = Release|ARM64EC + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 + {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {4297F93D-486A-4243-995F-7D32F59AE82A} + EndGlobalSection +EndGlobal diff --git a/third-party/mimalloc/include/mimalloc.h b/third-party/mimalloc/include/mimalloc.h index c41bcc8039..bd91db4338 100644 --- a/third-party/mimalloc/include/mimalloc.h +++ b/third-party/mimalloc/include/mimalloc.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,7 +8,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 217 // major + 2 digits minor +#define MI_MALLOC_VERSION 219 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes @@ -149,6 +149,7 @@ typedef void (mi_cdecl mi_error_fun)(int err, void* arg); mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg); mi_decl_export void mi_collect(bool force) mi_attr_noexcept; +mi_decl_export void mi_collect_reduce(size_t target_thread_owned) mi_attr_noexcept; mi_decl_export int mi_version(void) mi_attr_noexcept; mi_decl_export void mi_stats_reset(void) mi_attr_noexcept; mi_decl_export void mi_stats_merge(void) mi_attr_noexcept; @@ -259,11 +260,12 @@ typedef struct mi_heap_area_s { size_t used; // number of allocated blocks size_t block_size; // size in bytes of each block size_t full_block_size; // size in bytes of a full block including padding and metadata. + int heap_tag; // heap tag associated with this area } mi_heap_area_t; typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); -mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg); +mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // Experimental mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; @@ -275,7 +277,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool 
show_purge) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_inuse) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; @@ -289,8 +291,31 @@ mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_co mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); #endif + +// Experimental: allow sub-processes whose memory segments stay separated (and no reclamation between them) +// Used for example for separate interpreter's in one process. +typedef void* mi_subproc_id_t; +mi_decl_export mi_subproc_id_t mi_subproc_main(void); +mi_decl_export mi_subproc_id_t mi_subproc_new(void); +mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); +mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) + +// Experimental: visit abandoned heap areas (from threads that have been terminated) +mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); + +// Experimental: create a new heap with a specified heap tag. Set `allow_destroy` to false to allow the thread +// to reclaim abandoned memory (with a compatible heap_tag and arena_id) but in that case `mi_heap_destroy` will +// fall back to `mi_heap_delete`. +mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id); + // deprecated -mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; + +// Experimental: objects followed by a guard page. +// A sample rate of 0 disables guarded objects, while 1 uses a guard page for every object. 
+// A seed of 0 uses a random start point. Only objects within the size bound are eligible for guard pages. +mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed); +mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max); // ------------------------------------------------------ @@ -332,7 +357,7 @@ typedef enum mi_option_e { mi_option_deprecated_segment_cache, mi_option_deprecated_page_reset, mi_option_abandoned_page_purge, // immediately purge delayed purges on thread termination - mi_option_deprecated_segment_reset, + mi_option_deprecated_segment_reset, mi_option_eager_commit_delay, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) mi_option_purge_delay, // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. (=10) mi_option_use_numa_nodes, // 0 = use all available numa nodes, otherwise use at most N nodes. @@ -348,6 +373,13 @@ typedef enum mi_option_e { mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) + mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) + mi_option_guarded_min, // only used when building with MI_GUARDED: minimal rounded object size for guarded objects (=0) + mi_option_guarded_max, // only used when building with MI_GUARDED: maximal rounded object size for guarded objects (=0) + mi_option_guarded_precise, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) + mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) + mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) + mi_option_target_segments_per_thread, // experimental (=0) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/third-party/mimalloc/include/mimalloc/atomic.h b/third-party/mimalloc/include/mimalloc/atomic.h index d5333dd90f..dbd7160cf1 100644 --- a/third-party/mimalloc/include/mimalloc/atomic.h +++ b/third-party/mimalloc/include/mimalloc/atomic.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023 Microsoft Research, Daan Leijen +Copyright (c) 2018-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,10 +8,21 @@ terms of the MIT license. 
A copy of the license can be found in the file #ifndef MIMALLOC_ATOMIC_H #define MIMALLOC_ATOMIC_H +// include windows.h or pthreads.h +#if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#elif !defined(__wasi__) && (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__)) +#define MI_USE_PTHREADS +#include +#endif + // -------------------------------------------------------------------------------------------- // Atomics // We need to be portable between C, C++, and MSVC. -// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode. +// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode. // This is why we try to use only `uintptr_t` and `*` as atomic types. // To gain better insight in the range of used atomics, we use explicitly named memory order operations // instead of passing the memory order as a parameter. @@ -20,33 +31,33 @@ terms of the MIT license. 
A copy of the license can be found in the file #if defined(__cplusplus) // Use C++ atomics #include -#define _Atomic(tp) std::atomic -#define mi_atomic(name) std::atomic_##name -#define mi_memory_order(name) std::memory_order_##name -#if (__cplusplus >= 202002L) // c++20, see issue #571 -#define MI_ATOMIC_VAR_INIT(x) x +#define _Atomic(tp) std::atomic +#define mi_atomic(name) std::atomic_##name +#define mi_memory_order(name) std::memory_order_##name +#if (__cplusplus >= 202002L) // c++20, see issue #571 + #define MI_ATOMIC_VAR_INIT(x) x #elif !defined(ATOMIC_VAR_INIT) -#define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #else - #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) + #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif #elif defined(_MSC_VER) // Use MSVC C wrapper for C11 atomics -#define _Atomic(tp) tp -#define MI_ATOMIC_VAR_INIT(x) x -#define mi_atomic(name) mi_atomic_##name -#define mi_memory_order(name) mi_memory_order_##name +#define _Atomic(tp) tp +#define MI_ATOMIC_VAR_INIT(x) x +#define mi_atomic(name) mi_atomic_##name +#define mi_memory_order(name) mi_memory_order_##name #else // Use C11 atomics #include -#define mi_atomic(name) atomic_##name -#define mi_memory_order(name) memory_order_##name +#define mi_atomic(name) atomic_##name +#define mi_memory_order(name) memory_order_##name #if (__STDC_VERSION__ >= 201710L) // c17, see issue #735 - #define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #elif !defined(ATOMIC_VAR_INIT) - #define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #else - #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) + #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif #endif @@ -61,6 +72,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_atomic_load_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed)) #define mi_atomic_store_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_exchange_relaxed(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) @@ -99,6 +111,7 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,(tp*)des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,(tp*)des) +#define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,(tp*)x) #define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,(tp*)x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,(tp*)x) #else @@ -107,6 +120,7 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des) +#define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,x) #define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x) 
#endif @@ -133,10 +147,6 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { #elif defined(_MSC_VER) // Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics. -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include #ifdef _WIN64 typedef LONG64 msc_intptr_t; @@ -280,6 +290,7 @@ static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_exchange_ptr_relaxed(tp,p,x) (tp*)mi_atomic_exchange_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)x) #define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x) @@ -302,11 +313,16 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p, -sub); } + +// ---------------------------------------------------------------------- +// Once and Guard +// ---------------------------------------------------------------------- + typedef _Atomic(uintptr_t) mi_atomic_once_t; // Returns true only on the first invocation static inline bool mi_atomic_once( mi_atomic_once_t* once ) { - if (mi_atomic_load_relaxed(once) != 0) return false; // quick test + if (mi_atomic_load_relaxed(once) != 0) return false; // quick test uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1 } @@ -322,17 +338,16 @@ typedef _Atomic(uintptr_t) 
mi_atomic_guard_t; +// ---------------------------------------------------------------------- // Yield +// ---------------------------------------------------------------------- + #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { std::this_thread::yield(); } #elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include static inline void mi_atomic_yield(void) { YieldProcessor(); } @@ -390,4 +405,134 @@ static inline void mi_atomic_yield(void) { #endif +// ---------------------------------------------------------------------- +// Locks +// These do not have to be recursive and should be light-weight +// in-process only locks. Only used for reserving arena's and to +// maintain the abandoned list. +// ---------------------------------------------------------------------- +#if _MSC_VER +#pragma warning(disable:26110) // unlock with holding lock +#endif + +#define mi_lock(lock) for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) ) + +#if defined(_WIN32) + +#if 1 +#define mi_lock_t SRWLOCK // slim reader-writer lock + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryAcquireSRWLockExclusive(lock); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + AcquireSRWLockExclusive(lock); +} +static inline void mi_lock_release(mi_lock_t* lock) { + ReleaseSRWLockExclusive(lock); +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeSRWLock(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + +#else +#define mi_lock_t CRITICAL_SECTION + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + EnterCriticalSection(lock); +} +static inline void mi_lock_release(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(lock); +} +static inline void 
mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(lock); +} + +#endif + +#elif defined(MI_USE_PTHREADS) + +void _mi_error_message(int err, const char* fmt, ...); + +#define mi_lock_t pthread_mutex_t + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + const int err = pthread_mutex_lock(lock); + if (err != 0) { + _mi_error_message(err, "internal error: lock cannot be acquired\n"); + } +} +static inline void mi_lock_release(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} +static inline void mi_lock_init(mi_lock_t* lock) { + pthread_mutex_init(lock, NULL); +} +static inline void mi_lock_done(mi_lock_t* lock) { + pthread_mutex_destroy(lock); +} + +#elif defined(__cplusplus) + +#include +#define mi_lock_t std::mutex + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return lock->try_lock(); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + lock->lock(); +} +static inline void mi_lock_release(mi_lock_t* lock) { + lock->unlock(); +} +static inline void mi_lock_init(mi_lock_t* lock) { + (void)(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + +#else + +// fall back to poor man's locks. +// this should only be the case in a single-threaded environment (like __wasi__) + +#define mi_lock_t _Atomic(uintptr_t) + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + for (int i = 0; i < 1000; i++) { // for at most 1000 tries? 
+ if (mi_lock_try_acquire(lock)) return; + mi_atomic_yield(); + } +} +static inline void mi_lock_release(mi_lock_t* lock) { + mi_atomic_store_release(lock, (uintptr_t)0); +} +static inline void mi_lock_init(mi_lock_t* lock) { + mi_lock_release(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + +#endif + + #endif // __MIMALLOC_ATOMIC_H diff --git a/third-party/mimalloc/include/mimalloc/internal.h b/third-party/mimalloc/include/mimalloc/internal.h index 6c6e5ed04f..e7e7b50835 100644 --- a/third-party/mimalloc/include/mimalloc/internal.h +++ b/third-party/mimalloc/include/mimalloc/internal.h @@ -10,7 +10,7 @@ terms of the MIT license. A copy of the license can be found in the file // -------------------------------------------------------------------------- -// This file contains the interal API's of mimalloc and various utility +// This file contains the internal API's of mimalloc and various utility // functions and macros. // -------------------------------------------------------------------------- @@ -31,16 +31,25 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_decl_thread __declspec(thread) #define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) #define mi_decl_weak +#define mi_decl_hidden #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread #define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) #define mi_decl_weak __attribute__((weak)) +#define mi_decl_hidden __attribute__((visibility("hidden"))) +#elif __cplusplus >= 201103L // c++11 +#define mi_decl_noinline +#define mi_decl_thread thread_local +#define mi_decl_cache_align alignas(MI_CACHE_LINE) +#define mi_decl_weak +#define mi_decl_hidden #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) #define mi_decl_cache_align #define mi_decl_weak +#define mi_decl_hidden #endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) @@ -53,150 +62,175 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_decl_externc #endif -// pthreads -#if !defined(_WIN32) && !defined(__wasi__) -#define MI_USE_PTHREADS -#include -#endif +// "libc.c" +#include +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); +bool _mi_getenv(const char* name, char* result, size_t result_size); // "options.c" -void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); -void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); -void _mi_warning_message(const char* fmt, ...); -void _mi_verbose_message(const char* fmt, ...); -void _mi_trace_message(const char* fmt, ...); -void _mi_options_init(void); -void _mi_error_message(int err, const char* fmt, ...); +void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_options_init(void); +long _mi_option_get_fast(mi_option_t option); +void _mi_error_message(int err, const char* fmt, ...); // random.c -void _mi_random_init(mi_random_ctx_t* ctx); -void _mi_random_init_weak(mi_random_ctx_t* ctx); -void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); -void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); -uintptr_t _mi_random_next(mi_random_ctx_t* ctx); -uintptr_t _mi_heap_random_next(mi_heap_t* heap); -uintptr_t _mi_os_random_weak(uintptr_t extra_seed); +void _mi_random_init(mi_random_ctx_t* 
ctx); +void _mi_random_init_weak(mi_random_ctx_t* ctx); +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); +void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); +uintptr_t _mi_random_next(mi_random_ctx_t* ctx); +uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c extern mi_decl_cache_align mi_stats_t _mi_stats_main; -extern mi_decl_cache_align const mi_page_t _mi_page_empty; -bool _mi_is_main_thread(void); -size_t _mi_current_thread_count(void); -bool _mi_preloading(void); // true while the C runtime is not initialized yet +extern mi_decl_hidden mi_decl_cache_align const mi_page_t _mi_page_empty; +void _mi_process_load(void); +void mi_cdecl _mi_process_done(void); +bool _mi_is_redirected(void); +bool _mi_allocator_init(const char** message); +void _mi_allocator_done(void); +bool _mi_is_main_thread(void); +size_t _mi_current_thread_count(void); +bool _mi_preloading(void); // true while the C runtime is not initialized yet +void _mi_thread_done(mi_heap_t* heap); +void _mi_thread_data_collect(void); +void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap -void _mi_thread_done(mi_heap_t* heap); -void _mi_thread_data_collect(void); -void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); +void _mi_heap_guarded_init(mi_heap_t* heap); // os.c -void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); -void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); - -size_t _mi_os_page_size(void); -size_t _mi_os_good_alloc_size(size_t size); -bool 
_mi_os_has_overcommit(void); -bool _mi_os_has_virtual_reserve(void); - -bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); -bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_protect(void* addr, size_t size); -bool _mi_os_unprotect(void* addr, size_t size); -bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats); - -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats); -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats); - -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); -bool _mi_os_use_large_page(size_t size, size_t alignment); -size_t _mi_os_large_page_size(void); - -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_memid_t* memid); +void _mi_os_free(void* p, size_t size, mi_memid_t memid); +void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); + +size_t _mi_os_page_size(void); +size_t _mi_os_good_alloc_size(size_t size); +bool _mi_os_has_overcommit(void); +bool _mi_os_has_virtual_reserve(void); + +bool _mi_os_reset(void* addr, size_t size); +bool _mi_os_commit(void* p, size_t size, bool* is_zero); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_protect(void* addr, size_t size); +bool _mi_os_unprotect(void* addr, size_t size); +bool _mi_os_purge(void* p, size_t size); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size); + +void* 
_mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); + +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_use_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); + +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); -bool _mi_arena_contains(const void* p); -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); - -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); -size_t _mi_arena_segment_abandoned_count(void); - -typedef struct mi_arena_field_cursor_s { // abstract - mi_arena_id_t start; - int count; - size_t bitmap_idx; +void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid); +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t 
request_arena_id); +bool _mi_arena_contains(const void* p); +void _mi_arenas_collect(bool force_purge); +void _mi_arena_unsafe_destroy_all(void); + +bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); +void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); + +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); + +typedef struct mi_arena_field_cursor_s { // abstract struct + size_t os_list_count; // max entries to visit in the OS abandoned list + size_t start; // start arena idx (may need to be wrapped) + size_t end; // end arena idx (exclusive, may need to be wrapped) + size_t bitmap_idx; // current bit idx for an arena + mi_subproc_t* subproc; // only visit blocks in this sub-process + bool visit_all; // ensure all abandoned blocks are seen (blocking) + bool hold_visit_lock; // if the subproc->abandoned_os_visit_lock is held } mi_arena_field_cursor_t; -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current); mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); +void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); // "segment-map.c" -void _mi_segment_map_allocated_at(const mi_segment_t* segment); -void _mi_segment_map_freed_at(const mi_segment_t* segment); +void _mi_segment_map_allocated_at(const mi_segment_t* segment); +void _mi_segment_map_freed_at(const mi_segment_t* segment); +void _mi_segment_map_unsafe_destroy(void); // "segment.c" -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); +mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld); void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); 
void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); -void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld); +void _mi_segment_collect(mi_segment_t* segment, bool force); #if MI_HUGE_PAGE_ABANDON -void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #else -void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #endif uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); -void _mi_abandoned_await_readers(void); void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld); bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); +bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // "page.c" -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; +void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; + +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... 
+void _mi_page_force_abandon(mi_page_t* page); -void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks -void _mi_page_unfull(mi_page_t* page); -void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... -void _mi_heap_delayed_free_all(mi_heap_t* heap); -bool _mi_heap_delayed_free_partial(mi_heap_t* heap); -void _mi_heap_collect_retired(mi_heap_t* heap, bool force); +void _mi_heap_delayed_free_all(mi_heap_t* heap); +bool _mi_heap_delayed_free_partial(mi_heap_t* heap); +void _mi_heap_collect_retired(mi_heap_t* heap, bool force); -void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); -bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); -size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); -void _mi_deferred_free(mi_heap_t* heap, bool force); +void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); +bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); -void _mi_page_free_collect(mi_page_t* page,bool force); -void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments -size_t _mi_bin_size(uint8_t bin); // for stats -uint8_t _mi_bin(size_t size); // for stats +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats // "heap.c" -void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t 
tag); -void _mi_heap_destroy_pages(mi_heap_t* heap); -void _mi_heap_collect_abandon(mi_heap_t* heap); -void _mi_heap_set_default_direct(mi_heap_t* heap); -bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); -void _mi_heap_unsafe_destroy_all(void); -mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); +void _mi_heap_unsafe_destroy_all(mi_heap_t* heap); +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); // "stats.c" -void _mi_stats_done(mi_stats_t* stats); +void _mi_stats_done(mi_stats_t* stats); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); @@ -213,18 +247,6 @@ bool _mi_free_delayed_block(mi_block_t* block); void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); -// "libc.c" -#include -void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); -void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); -char _mi_toupper(char c); -int _mi_strnicmp(const char* s, const char* t, size_t n); -void _mi_strlcpy(char* dest, const char* src, size_t dest_size); -void _mi_strlcat(char* dest, const char* src, size_t dest_size); -size_t _mi_strlen(const char* s); -size_t _mi_strnlen(const char* s, size_t max_len); -bool _mi_getenv(const char* name, char* result, size_t result_size); - #if 
MI_DEBUG>1 bool _mi_page_is_valid(mi_page_t* page); #endif @@ -349,6 +371,14 @@ static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { return (divider == 0 ? size : ((size + divider - 1) / divider)); } + +// clamp an integer +static inline size_t _mi_clamp(size_t sz, size_t min, size_t max) { + if (sz < min) return min; + else if (sz > max) return max; + else return sz; +} + // Is memory zero initialized? static inline bool mi_mem_is_zero(const void* p, size_t size) { for (size_t i = 0; i < size; i++) { @@ -410,7 +440,7 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot Heap functions ------------------------------------------------------------------------------------------- */ -extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap +extern mi_decl_hidden const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap static inline bool mi_heap_is_backing(const mi_heap_t* heap) { return (heap->tld->heap_backing == heap); @@ -418,11 +448,11 @@ static inline bool mi_heap_is_backing(const mi_heap_t* heap) { static inline bool mi_heap_is_initialized(mi_heap_t* heap) { mi_assert_internal(heap != NULL); - return (heap != &_mi_heap_empty); + return (heap != NULL && heap != &_mi_heap_empty); } static inline uintptr_t _mi_ptr_cookie(const void* p) { - extern mi_heap_t _mi_heap_main; + extern mi_decl_hidden mi_heap_t _mi_heap_main; mi_assert_internal(_mi_heap_main.cookie != 0); return ((uintptr_t)p ^ _mi_heap_main.cookie); } @@ -589,7 +619,7 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { } // is more than 7/8th of a page in use? 
-static inline bool mi_page_mostly_used(const mi_page_t* page) { +static inline bool mi_page_is_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; return (page->reserved - page->used <= frac); @@ -620,6 +650,39 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { page->flags.x.has_aligned = has_aligned; } +/* ------------------------------------------------------------------- + Guarded objects +------------------------------------------------------------------- */ +#if MI_GUARDED +static inline bool mi_block_ptr_is_guarded(const mi_block_t* block, const void* p) { + const ptrdiff_t offset = (uint8_t*)p - (uint8_t*)block; + return (offset >= (ptrdiff_t)(sizeof(mi_block_t)) && block->next == MI_BLOCK_TAG_GUARDED); +} + +static inline bool mi_heap_malloc_use_guarded(mi_heap_t* heap, size_t size) { + // this code is written to result in fast assembly as it is on the hot path for allocation + const size_t count = heap->guarded_sample_count - 1; // if the rate was 0, this will underflow and count for a long time.. 
+ if mi_likely(count != 0) { + // no sample + heap->guarded_sample_count = count; + return false; + } + else if (size >= heap->guarded_size_min && size <= heap->guarded_size_max) { + // use guarded allocation + heap->guarded_sample_count = heap->guarded_sample_rate; // reset + return (heap->guarded_sample_rate != 0); + } + else { + // failed size criteria, rewind count (but don't write to an empty heap) + if (heap->guarded_sample_rate != 0) { heap->guarded_sample_count = 1; } + return false; + } +} + +mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; + +#endif + /* ------------------------------------------------------------------- Encoding/Decoding the free list next pointers @@ -679,6 +742,16 @@ static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const return mi_rotl(x ^ keys[1], keys[0]) + keys[0]; } +static inline uint32_t mi_ptr_encode_canary(const void* null, const void* p, const uintptr_t* keys) { + const uint32_t x = (uint32_t)(mi_ptr_encode(null,p,keys)); + // make the lowest byte 0 to prevent spurious read overflows which could be a security issue (issue #951) + #ifdef MI_BIG_ENDIAN + return (x & 0x00FFFFFF); + #else + return (x & 0xFFFFFF00); + #endif +} + static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) { mi_track_mem_defined(block,sizeof(mi_block_t)); mi_block_t* next; @@ -803,7 +876,7 @@ static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool static inline uintptr_t _mi_random_shuffle(uintptr_t x) { if (x==0) { x = 17; } // ensure we don't get stuck in generating zeros -#if (MI_INTPTR_SIZE==8) +#if (MI_INTPTR_SIZE>=8) // by Sebastiano Vigna, see: x ^= x >> 30; x *= 0xbf58476d1ce4e5b9UL; @@ -825,13 +898,13 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { // Optimize numa node access for the common case (= one node) // 
------------------------------------------------------------------- -int _mi_os_numa_node_get(mi_os_tld_t* tld); +int _mi_os_numa_node_get(void); size_t _mi_os_numa_node_count_get(void); -extern _Atomic(size_t) _mi_numa_node_count; -static inline int _mi_os_numa_node(mi_os_tld_t* tld) { +extern mi_decl_hidden _Atomic(size_t) _mi_numa_node_count; +static inline int _mi_os_numa_node(void) { if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } - else return _mi_os_numa_node_get(tld); + else return _mi_os_numa_node_get(); } static inline size_t _mi_os_numa_node_count(void) { const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); @@ -893,16 +966,18 @@ static inline size_t mi_ctz(uintptr_t x) { } #else -static inline size_t mi_ctz32(uint32_t x) { + +static inline size_t mi_ctz_generic32(uint32_t x) { // de Bruijn multiplication, see - static const unsigned char debruijn[32] = { + static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; + return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; } -static inline size_t mi_clz32(uint32_t x) { + +static inline size_t mi_clz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, @@ -914,28 +989,37 @@ static inline size_t mi_clz32(uint32_t x) { x |= x >> 4; x |= x >> 8; x |= x >> 16; - return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; + return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27]; } -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_clz32((uint32_t)x); -#else - size_t count = mi_clz32((uint32_t)(x >> 32)); - if (count < 32) return count; - return (32 + mi_clz32((uint32_t)x)); -#endif +static inline 
size_t mi_ctz(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_ctz_generic32((uint32_t)x); + #else + const uint32_t lo = (uint32_t)x; + if (lo != 0) { + return mi_ctz_generic32(lo); + } + else { + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + } + #endif } -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_ctz32((uint32_t)x); -#else - size_t count = mi_ctz32((uint32_t)x); - if (count < 32) return count; - return (32 + mi_ctz32((uint32_t)(x>>32))); -#endif + +static inline size_t mi_clz(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_clz_generic32((uint32_t)x); + #else + const uint32_t hi = (uint32_t)(x>>32); + if (hi != 0) { + return mi_clz_generic32(hi); + } + else { + return 32 + mi_clz_generic32((uint32_t)x); + } + #endif } #endif @@ -957,8 +1041,9 @@ static inline size_t mi_bsr(uintptr_t x) { #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include extern bool _mi_cpu_has_fsrm; +extern bool _mi_cpu_has_erms; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { - if (_mi_cpu_has_fsrm) { + if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); } else { @@ -966,7 +1051,7 @@ static inline void _mi_memcpy(void* dst, const void* src, size_t n) { } } static inline void _mi_memzero(void* dst, size_t n) { - if (_mi_cpu_has_fsrm) { + if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __stosb((unsigned char*)dst, 0, n); } else { diff --git a/third-party/mimalloc/include/mimalloc/prim.h b/third-party/mimalloc/include/mimalloc/prim.h index 3f4574ddd9..606b719943 100644 --- a/third-party/mimalloc/include/mimalloc/prim.h +++ b/third-party/mimalloc/include/mimalloc/prim.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 
2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -25,6 +25,8 @@ typedef struct mi_os_mem_config_s { size_t page_size; // default to 4KiB size_t large_page_size; // 0 if not supported, usually 2MiB (4MiB on Windows) size_t alloc_granularity; // smallest allocation size (usually 4KiB, on Windows 64KiB) + size_t physical_memory; // physical memory size + size_t virtual_address_bits; // usually 48 or 56 bits on 64-bit systems. (used to determine secure randomization) bool has_overcommit; // can we reserve more memory than can be actually committed? bool has_partial_free; // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc) bool has_virtual_reserve; // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory) @@ -41,9 +43,10 @@ int _mi_prim_free(void* addr, size_t size ); // If `commit` is false, the virtual memory range only needs to be reserved (with no access) // which will later be committed explicitly using `_mi_prim_commit`. // `is_zero` is set to true if the memory was zero initialized (as on most OS's) +// The `hint_addr` address is either `NULL` or a preferred allocation address but can be ignored. // pre: !commit => !allow_large // try_alignment >= _mi_os_page_size() and a power of 2 -int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr); +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr); // Commit memory. Returns error code or 0 on success. // For example, on Linux this would make the memory PROT_READ|PROT_WRITE. 
@@ -115,14 +118,14 @@ void _mi_prim_thread_done_auto_done(void); void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); + + + //------------------------------------------------------------------- -// Thread id: `_mi_prim_thread_id()` -// -// Getting the thread id should be performant as it is called in the -// fast path of `_mi_free` and we specialize for various platforms as -// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. -// We only require _mi_prim_thread_id() to return a unique id -// for each thread (unequal to zero). +// Access to TLS (thread local storage) slots. +// We need fast access to both a unique thread id (in `free.c:mi_free`) and +// to a thread-local heap pointer (in `alloc.c:mi_malloc`). +// To achieve this we use specialized code for various platforms. //------------------------------------------------------------------- // On some libc + platform combinations we can directly access a thread-local storage (TLS) slot. @@ -134,14 +137,14 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); // but unfortunately we can not detect support reliably (see issue #883) // We also use it on Apple OS as we use a TLS slot for the default heap there. 
#if defined(__GNUC__) && ( \ - (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \ - || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ ) -#define MI_HAS_TLS_SLOT +#define MI_HAS_TLS_SLOT 1 static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { void* res; @@ -202,13 +205,58 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif } +#elif _WIN32 && MI_WIN_USE_FIXED_TLS && !defined(MI_WIN_USE_FLS) + +// On windows we can store the thread-local heap at a fixed TLS slot to avoid +// thread-local initialization checks in the fast path. This uses a fixed location +// in the TCB though (last user-reserved slot by default) which may clash with other applications. 
+ +#define MI_HAS_TLS_SLOT 2 // 2 = we can reliably initialize the slot (saving a test on each malloc) + +#if MI_WIN_USE_FIXED_TLS > 1 +#define MI_TLS_SLOT (MI_WIN_USE_FIXED_TLS) +#elif MI_SIZE_SIZE == 4 +#define MI_TLS_SLOT (0x710) // Last user-reserved slot +// #define MI_TLS_SLOT (0xF0C) // Last TlsSlot (might clash with other app reserved slot) +#else +#define MI_TLS_SLOT (0x888) // Last user-reserved slot +// #define MI_TLS_SLOT (0x1678) // Last TlsSlot (might clash with other app reserved slot) +#endif + +static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { + #if (_M_X64 || _M_AMD64) && !defined(_M_ARM64EC) + return (void*)__readgsqword((unsigned long)slot); // direct load at offset from gs + #elif _M_IX86 && !defined(_M_ARM64EC) + return (void*)__readfsdword((unsigned long)slot); // direct load at offset from fs + #else + return ((void**)NtCurrentTeb())[slot / sizeof(void*)]; + #endif +} +static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + ((void**)NtCurrentTeb())[slot / sizeof(void*)] = value; +} + #endif + + +//------------------------------------------------------------------- +// Get a fast unique thread id. +// +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms as +// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. +// We only require _mi_prim_thread_id() to return a unique id +// for each thread (unequal to zero). +//------------------------------------------------------------------- + + // Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id // but unfortunately, it seems we cannot test for this reliably at this time (see issue #883) // Nevertheless, it seems needed on older graviton platforms (see issue #851). // For now, we only enable this for specific platforms. 
#if !defined(__APPLE__) /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ + && !defined(__CYGWIN__) \ && !defined(MI_LIBC_MUSL) \ && (!defined(__clang_major__) || __clang_major__ >= 14) /* older clang versions emit bad code; fall back to using the TLS slot () */ #if (defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \ @@ -235,10 +283,6 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); @@ -251,7 +295,7 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { return (uintptr_t)__builtin_thread_pointer(); } -#elif defined(MI_HAS_TLS_SLOT) +#elif MI_HAS_TLS_SLOT static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #if defined(__BIONIC__) @@ -278,7 +322,8 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { /* ---------------------------------------------------------------------------------------- -The thread local default heap: `_mi_prim_get_default_heap()` +Get the thread local default heap: `_mi_prim_get_default_heap()` + This is inlined here as it is on the fast path for allocation functions. 
On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a @@ -315,19 +360,21 @@ static inline mi_heap_t* mi_prim_get_default_heap(void); #endif -#if defined(MI_TLS_SLOT) +#if MI_TLS_SLOT # if !defined(MI_HAS_TLS_SLOT) # error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined" # endif static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); + #if MI_HAS_TLS_SLOT == 1 // check if the TLS slot is initialized if mi_unlikely(heap == NULL) { #ifdef __GNUC__ __asm(""); // prevent conditional load of the address of _mi_heap_empty #endif heap = (mi_heap_t*)&_mi_heap_empty; } + #endif return heap; } @@ -369,5 +416,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #endif // mi_prim_get_default_heap() - #endif // MIMALLOC_PRIM_H diff --git a/third-party/mimalloc/include/mimalloc/track.h b/third-party/mimalloc/include/mimalloc/track.h index a659d94044..4b5709e2b5 100644 --- a/third-party/mimalloc/include/mimalloc/track.h +++ b/third-party/mimalloc/include/mimalloc/track.h @@ -34,7 +34,7 @@ The corresponding `mi_track_free` still uses the block start pointer and origina The `mi_track_resize` is currently unused but could be called on reallocations within a block. `mi_track_init` is called at program start. 
-The following macros are for tools like asan and valgrind to track whether memory is +The following macros are for tools like asan and valgrind to track whether memory is defined, undefined, or not accessible at all: #define mi_track_mem_defined(p,size) @@ -82,10 +82,6 @@ defined, undefined, or not accessible at all: #define MI_TRACK_HEAP_DESTROY 1 #define MI_TRACK_TOOL "ETW" -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include "../src/prim/windows/etw.h" #define mi_track_init() EventRegistermicrosoft_windows_mimalloc(); @@ -96,7 +92,7 @@ defined, undefined, or not accessible at all: // no tracking #define MI_TRACK_ENABLED 0 -#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_HEAP_DESTROY 0 #define MI_TRACK_TOOL "none" #define mi_track_malloc_size(p,reqsize,size,zero) diff --git a/third-party/mimalloc/include/mimalloc/types.h b/third-party/mimalloc/include/mimalloc/types.h index 2fdde904bb..df6ff26c4f 100644 --- a/third-party/mimalloc/include/mimalloc/types.h +++ b/third-party/mimalloc/include/mimalloc/types.h @@ -73,6 +73,13 @@ terms of the MIT license. A copy of the license can be found in the file #endif #endif +// Use guard pages behind objects of a certain size (set by the MIMALLOC_DEBUG_GUARDED_MIN/MAX options) +// Padding should be disabled when using guard pages +// #define MI_GUARDED 1 +#if defined(MI_GUARDED) +#define MI_PADDING 0 +#endif + // Reserve extra padding at the end of each block to be more resilient against heap block overflows. // The padding can detect buffer overflow on free. 
#if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW)) @@ -193,7 +200,7 @@ typedef int32_t mi_ssize_t; #define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/4) // 8KiB on 64-bit #define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB on 64-bit #define MI_MEDIUM_OBJ_WSIZE_MAX (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE) -#define MI_LARGE_OBJ_SIZE_MAX (MI_SEGMENT_SIZE/2) // 32MiB on 64-bit +#define MI_LARGE_OBJ_SIZE_MAX (MI_SEGMENT_SIZE/2) // 16MiB on 64-bit #define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE) // Maximum number of size classes. (spaced exponentially in 12.5% increments) @@ -237,13 +244,20 @@ typedef struct mi_block_s { mi_encoded_t next; } mi_block_t; +#if MI_GUARDED +// we always align guarded pointers in a block at an offset +// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones +#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0)) +#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED) +#endif + // The delayed flags are used for efficient multi-threaded free-ing typedef enum mi_delayed_e { MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list - MI_NEVER_DELAYED_FREE = 3 // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim + MI_NEVER_DELAYED_FREE = 3 // sticky: used for abandoned pages without a owning heap; this only resets on page reclaim } mi_delayed_t; @@ -260,7 +274,7 @@ typedef union mi_page_flags_s { #else // under thread sanitizer, use a byte for each flag to suppress warning, issue #130 typedef union mi_page_flags_s { - uint16_t full_aligned; + uint32_t full_aligned; struct { uint8_t in_full; uint8_t has_aligned; @@ -319,7 +333,7 @@ typedef 
struct mi_page_s { mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) uint16_t used; // number of blocks in use (including blocks in `thread_free`) uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used for separated heaps by object type + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the page area containing the blocks @@ -408,7 +422,7 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { typedef struct mi_memid_os_info { void* base; // actual base address of the block (used for offset aligned allocations) - size_t alignment; // alignment at allocation + size_t size; // full allocation size } mi_memid_os_info_t; typedef struct mi_memid_arena_info { @@ -430,7 +444,7 @@ typedef struct mi_memid_s { // ----------------------------------------------------------------------------------------- -// Segments are large allocated memory blocks (8mb on 64 bit) from arenas or the OS. +// Segments are large allocated memory blocks (32mb on 64 bit) from arenas or the OS. // // Inside segments we allocated fixed size mimalloc pages (`mi_page_t`) that contain blocks. // The start of a segment is this structure with a fixed number of slice entries (`slices`) @@ -442,12 +456,16 @@ typedef struct mi_memid_s { // For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`). // Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while // large and huge pages span a variable amount of slices. 
+ +typedef struct mi_subproc_s mi_subproc_t; + typedef struct mi_segment_s { // constant fields mi_memid_t memid; // memory id for arena/OS allocation bool allow_decommit; // can we decommmit the memory bool allow_purge; // can we purge the memory (reset or decommit) size_t segment_size; + mi_subproc_t* subproc; // segment belongs to sub process // segment fields mi_msecs_t purge_expire; // purge slices in the `purge_mask` after this time @@ -457,12 +475,16 @@ typedef struct mi_segment_s { // from here is zero initialized struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) + bool dont_free; // can be temporarily true to ensure the segment is not freed size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) size_t abandoned_visits; // count how often this segment is visited during abondoned reclamation (to force reclaim if it takes too long) size_t used; // count of pages in use uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` + struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled + struct mi_segment_s* abandoned_os_prev; + size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT` size_t segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages. 
@@ -540,6 +562,13 @@ struct mi_heap_s { mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages uint8_t tag; // custom tag, can be used for separating heaps based on the object types + #if MI_GUARDED + size_t guarded_size_min; // minimal size for guarded objects + size_t guarded_size_max; // maximal size for guarded objects + size_t guarded_sample_rate; // sample rate (set to 0 to disable guarded pages) + size_t guarded_sample_seed; // starting sample count + size_t guarded_sample_count; // current sample count (counting down to 0) + #endif mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; @@ -633,24 +662,34 @@ typedef struct mi_stats_s { mi_stat_counter_t arena_count; mi_stat_counter_t arena_crossover_count; mi_stat_counter_t arena_rollback_count; + mi_stat_counter_t guarded_alloc_count; #if MI_STAT>1 mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; #endif } mi_stats_t; +// add to stat keeping track of the peak void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +// adjust stat in special cases to compensate for double counting +void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount); +void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount); +// counters can just be increased void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #if (MI_STAT) #define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) #define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) #define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) +#define mi_stat_adjust_increase(stat,amount) _mi_stat_adjust_increase( &(stat), amount) +#define 
mi_stat_adjust_decrease(stat,amount) _mi_stat_adjust_decrease( &(stat), amount) #else -#define mi_stat_increase(stat,amount) (void)0 -#define mi_stat_decrease(stat,amount) (void)0 -#define mi_stat_counter_increase(stat,amount) (void)0 +#define mi_stat_increase(stat,amount) ((void)0) +#define mi_stat_decrease(stat,amount) ((void)0) +#define mi_stat_counter_increase(stat,amount) ((void)0) +#define mi_stat_adjuct_increase(stat,amount) ((void)0) +#define mi_stat_adjust_decrease(stat,amount) ((void)0) #endif #define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) @@ -658,6 +697,21 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +// ------------------------------------------------------ +// Sub processes do not reclaim or visit segments +// from other sub processes +// ------------------------------------------------------ + +struct mi_subproc_s { + _Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process + _Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list + mi_lock_t abandoned_os_lock; // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations) + mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list + mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory) + mi_segment_t* abandoned_os_list_tail; // the tail-end of the list + mi_memid_t memid; // provenance of this memory block +}; + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ @@ -672,13 +726,6 @@ typedef struct mi_span_queue_s { #define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT) -// OS thread local data 
-typedef struct mi_os_tld_s { - size_t region_idx; // start point for next allocation - mi_stats_t* stats; // points to tld stats -} mi_os_tld_t; - - // Segments thread local data typedef struct mi_segments_tld_s { mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments @@ -687,8 +734,8 @@ typedef struct mi_segments_tld_s { size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments size_t reclaim_count;// number of reclaimed (abandoned) segments + mi_subproc_t* subproc; // sub-process this thread belongs to. mi_stats_t* stats; // points to tld stats - mi_os_tld_t* os; // points to os stats } mi_segments_tld_t; // Thread local data @@ -698,7 +745,6 @@ struct mi_tld_s { mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_segments_tld_t segments; // segment tld - mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/third-party/mimalloc/readme.md b/third-party/mimalloc/readme.md index a0296b43c3..264da2b56e 100644 --- a/third-party/mimalloc/readme.md +++ b/third-party/mimalloc/readme.md @@ -12,8 +12,8 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release tag: `v2.1.7` (2024-05-21). -Latest v1 tag: `v1.8.7` (2024-05-21). +Latest release tag: `v2.1.9` (2025-01-03). +Latest v1 tag: `v1.8.9` (2024-01-03). mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) 
you can use it as: @@ -22,7 +22,7 @@ without code changes, for example, on dynamically linked ELF-based systems (Linu ``` It also includes a robust way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include: -- __small and consistent__: the library is about 8k LOC using simple and +- __small and consistent__: the library is about 10k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic _heartbeat_ and deferred freeing (for @@ -70,35 +70,40 @@ Enjoy! ### Branches -* `master`: latest stable release (based on `dev-slice`). +* `master`: latest stable release (based on `dev2`). * `dev`: development branch for mimalloc v1. Use this branch for submitting PR's. -* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev` (and is essentially equal to `dev` except for -`src/segment.c`) +* `dev2`: development branch for mimalloc v2. This branch is downstream of `dev` + (and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage + mimalloc pages what can reduce fragmentation. +* `dev3`: development branch for mimalloc v3-alpha. This branch is downstream of `dev`. This is still experimental, + but simplifies previous versions by having no segments any more. This improves sharing of memory + between threads, and on certain large workloads uses less memory with less fragmentation. ### Releases -Note: the `v2.x` version has a different algorithm for managing internal mimalloc pages (as slices) that tends to use reduce -memory usage - and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance - (see [below](#performance)); please report if you observe any significant performance regression. - +* 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0-alpha`: Interim release. Support Windows arm64. 
New [guarded](#guarded) build that can place OS + guard pages behind objects to catch buffer overflows as they occur. + Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for + thread local destructors (in Rust/C++); macOS tag change; macOS TLS slot fix; improve stats; + consistent mimalloc.dll on Windows (instead of mimalloc-override.dll); fix mimalloc-redirect on Win11 H2; + add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance. * 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches. * 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation. * 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds. Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size - directly available (and new `block_size_shift` to improve aligned block free-ing). + directly available (and new `block_size_shift` to improve aligned block free-ing). New approach to collection of abandoned segments: When a thread terminates the segments it owns are abandoned (containing still live objects) and these can be - reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's + reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). 
The option `mi_option_max_segment_reclaim` gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). -* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity +* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. - + * 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms. * 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support dynamic overriding on Windows 11. Improved tracing precision @@ -106,14 +111,14 @@ memory usage abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. * 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. - Support arbitrary large alignments (in particular for `std::pmr` pools). - Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). - Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). + Support arbitrary large alignments (in particular for `std::pmr` pools). + Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). + Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). Various small bug fixes. * 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow detection. Initial - support for attaching heaps to a speficic memory area (only in v2). 
Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`, . + support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`, . * 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix @@ -164,9 +169,9 @@ The `mimalloc` project builds a static library (in `out/msvc-x64`), while the `mimalloc-override` project builds a DLL for overriding malloc in the entire program. -## macOS, Linux, BSD, etc. +## Linux, macOS, BSD, etc. -We use [`cmake`](https://cmake.org)1 as the build system: +We use [`cmake`](https://cmake.org) as the build system: ``` > mkdir -p out/release @@ -189,24 +194,42 @@ maintains detailed statistics as: > cmake -DCMAKE_BUILD_TYPE=Debug ../.. > make ``` + This will name the shared library as `libmimalloc-debug.so`. -Finally, you can build a _secure_ version that uses guard pages, encrypted -free lists, etc., as: +Finally, you can build a _secure_ version that uses guard pages, encrypted free lists, etc., as: + ``` > mkdir -p out/secure > cd out/secure > cmake -DMI_SECURE=ON ../.. > make ``` + This will name the shared library as `libmimalloc-secure.so`. -Use `ccmake`2 instead of `cmake` -to see and customize all the available build options. +Use `cmake ../.. -LH` to see all the available build options. + +The examples use the default compiler. 
If you like to use another, use: + +``` +> CC=clang CXX=clang++ cmake ../.. +``` -Notes: -1. Install CMake: `sudo apt-get install cmake` -2. Install CCMake: `sudo apt-get install cmake-curses-gui` +## Cmake with Visual Studio +You can also use cmake on Windows. Open a Visual Studio 2022 development prompt +and invoke `cmake` with the right [generator](https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2017%202022.html) +and architecture, like: + +``` +> cmake ..\.. -G "Visual Studio 17 2022" -A x64 -DMI_OVERRIDE=ON +``` + +The cmake build type is specified when actually building, for example: + +``` +> cmake --build . --config=Release +``` ## Single source @@ -226,7 +249,7 @@ mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist with other allocators linked to the same program. If you use `cmake`, you can simply use: ``` -find_package(mimalloc 1.4 REQUIRED) +find_package(mimalloc 1.8 REQUIRED) ``` in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either: ``` @@ -240,7 +263,7 @@ to link with the static library. See `test\CMakeLists.txt` for an example. For best performance in C++ programs, it is also recommended to override the global `new` and `delete` operators. For convenience, mimalloc provides -[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. +[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` interface. @@ -295,14 +318,14 @@ You can set further options either programmatically (using [`mi_option_set`](htt Advanced options: -- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc - allocates segments and pages. 
Set this to 2 (default) to - only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems - as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). - Note that eager commit only increases the commit but not the actual the peak resident set +- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc + allocates segments and pages. Set this to 2 (default) to + only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems + as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). + Note that eager commit only increases the commit but not the actual the peak resident set (rss) so it is generally ok to enable this. -- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge - OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which +- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge + OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. @@ -310,7 +333,7 @@ Advanced options: - `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, `MADV_DONTNEED` (which decresease rss immediately) on `mmap` systems). 
Set this to 0 to instead "reset" unused memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems). - Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual + Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). Further options for large workloads and services: @@ -319,15 +342,16 @@ Further options for large workloads and services: at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). -- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly - improve performance. When this option is disabled, it also disables transparent huge pages (THP) for the process - (on Linux and Android). Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs +- `MIMALLOC_ALLOW_LARGE_OS_PAGES=0`: Set to 1 to use large OS pages (2 or 4MiB) when available; for some workloads this can significantly + improve performance. When this option is disabled (default), it also disables transparent huge pages (THP) for the process + (on Linux and Android). On Linux the default setting is 2 -- this enables the use of large pages through THP only. + Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). 
However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that - can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). + can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). - `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at startup and sometimes this can give a large (latency) performance improvement on big workloads. - Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large + Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). @@ -367,13 +391,39 @@ As always, evaluate with care as part of an overall security strategy as all of ## Debug Mode -When _mimalloc_ is built using debug mode, various checks are done at runtime to catch development errors. +When _mimalloc_ is built using debug mode, (`-DCMAKE_BUILD_TYPE=Debug`), +various checks are done at runtime to catch development errors. - Statistics are maintained in detail for each object size. They can be shown using `MIMALLOC_SHOW_STATS=1` at runtime. - All objects have padding at the end to detect (byte precise) heap block overflows. - Double free's, and freeing invalid heap pointers are detected. - Corrupted free-lists and some forms of use-after-free are detected. +## Guarded Mode + +_mimalloc_ can be build in guarded mode using the `-DMI_GUARDED=ON` flags in `cmake`. +This enables placing OS guard pages behind certain object allocations to catch buffer overflows as they occur. 
+This can be invaluable to catch buffer-overflow bugs in large programs. However, it also means that any object +allocated with a guard page takes at least 8 KiB memory for the guard page and its alignment. As such, allocating +a guard page for every allocation may be too expensive both in terms of memory, and in terms of performance with +many system calls. Therefore, there are various environment variables (and options) to tune this: + +- `MIMALLOC_GUARDED_SAMPLE_RATE=N`: Set the sample rate to `N` (by default 4000). This mode places a guard page + behind every `N` suitable object allocations (per thread). Since the performance in guarded mode without placing + guard pages is close to release mode, this can be used to enable guard pages even in production to catch latent + buffer overflow bugs. Set the sample rate to `1` to guard every object, and to `0` to place no guard pages at all. + +- `MIMALLOC_GUARDED_SAMPLE_SEED=N`: Start sampling at `N` (by default random). Can be used to reproduce a buffer + overflow if needed. + +- `MIMALLOC_GUARDED_MIN=N`, `MIMALLOC_GUARDED_MAX=N`: Minimal and maximal _rounded_ object sizes for which a guard + page is considered (`0` and `1GiB` respectively). If you suspect a buffer overflow occurs with an object of size + 141, set the minimum and maximum to `148` and the sample rate to `1` to have all of those guarded. + +- `MIMALLOC_GUARDED_PRECISE=1`: If we have an object of size 13, we would usually place it an aligned 16 bytes in + front of the guard page. Using `MIMALLOC_GUARDED_PRECISE` places it exactly 13 bytes before a page so that even + a 1 byte overflow is detected. This violates the C/C++ minimal alignment guarantees though so use with care. + # Overriding Standard Malloc @@ -415,43 +465,48 @@ Note that certain security restrictions may apply when doing this from the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash). 
-### Dynamic Override on Windows +# Windows Override Dynamically overriding on mimalloc on Windows -is robust and has the particular advantage to be able to redirect all malloc/free calls that go through -the (dynamic) C runtime allocator, including those from other DLL's or libraries. -As it intercepts all allocation calls on a low level, it can be used reliably +is robust and has the particular advantage to be able to redirect all malloc/free calls +that go through the (dynamic) C runtime allocator, including those from other DLL's or +libraries. As it intercepts all allocation calls on a low level, it can be used reliably on large programs that include other 3rd party components. -There are four requirements to make the overriding work robustly: +There are four requirements to make the overriding work well: 1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). -2. Link your program explicitly with `mimalloc-override.dll` library. - To ensure the `mimalloc-override.dll` is loaded at run-time it is easiest to insert some - call to the mimalloc API in the `main` function, like `mi_version()` - (or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project - for an example on how to use this. -3. The [`mimalloc-redirect.dll`](bin) (or `mimalloc-redirect32.dll`) must be put - in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency of that DLL). - The redirection DLL ensures that all calls to the C runtime malloc API get redirected to - mimalloc functions (which reside in `mimalloc-override.dll`). -4. Ensure the `mimalloc-override.dll` comes as early as possible in the import + +2. Link your program explicitly with the `mimalloc.lib` export library for the `mimalloc.dll`. + (which must be compiled with `-DMI_OVERRIDE=ON`, which is the default though). 
+ To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest + to insert some call to the mimalloc API in the `main` function, like `mi_version()` + (or use the `/include:mi_version` switch on the linker command, or + similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). + See the `mimalloc-test-override` project for an example on how to use this. + +3. The `mimalloc-redirect.dll` must be put in the same directory as the main + `mimalloc.dll` at runtime (as it is a dependency of that DLL). + The redirection DLL ensures that all calls to the C runtime malloc API get + redirected to mimalloc functions (which reside in `mimalloc.dll`). + +4. Ensure the `mimalloc.dll` comes as early as possible in the import list of the final executable (so it can intercept all potential allocations). + You can use `minject -l ` to check this if needed. For best performance on Windows with C++, it is also recommended to also override the `new`/`delete` operations (by including -[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) +[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic -overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. +overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully +redirected. + +For different platforms than x64, you may need a specific [redirection dll](bin). +Furthermore, we cannot always re-link an executable or ensure `mimalloc.dll` comes +first in the import table. In such cases the [`minject`](bin) tool can be used +to patch the executable's import tables. -We cannot always re-link an executable with `mimalloc-override.dll`, and similarly, we cannot always -ensure the the DLL comes first in the import table of the final executable. 
-In many cases though we can patch existing executables without any recompilation -if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll` -into the import table (and put `mimalloc-redirect.dll` in the same folder) -Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388) or -the [`minject`](bin) program. ## Static override @@ -462,6 +517,7 @@ an object file instead of a library file as linkers give preference to that over archives to resolve symbols. To ensure that the standard malloc interface resolves to the _mimalloc_ library, link it as the first object file. For example: + ``` > gcc -o myprogram mimalloc.o myfile1.c ... ``` @@ -469,16 +525,17 @@ object file. For example: Another way to override statically that works on all platforms, is to link statically to mimalloc (as shown in the introduction) and include a header file in each source file that re-defines `malloc` etc. to `mi_malloc`. -This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably though if all sources are +This is provided by [`mimalloc-override.h`](include/mimalloc-override.h). This only works +reliably though if all sources are under your control or otherwise mixing of pointers from different heaps may occur! # Tools Generally, we recommend using the standard allocator with memory tracking tools, but mimalloc -can also be build to support the [address sanitizer][asan] or the excellent [Valgrind] tool. +can also be build to support the [address sanitizer][asan] or the excellent [Valgrind] tool. Moreover, it can be build to support Windows event tracing ([ETW]). -This has a small performance overhead but does allow detecting memory leaks and byte-precise +This has a small performance overhead but does allow detecting memory leaks and byte-precise buffer overflows directly on final executables. 
See also the `test/test-wrong.c` file to test with various tools. ## Valgrind @@ -505,9 +562,13 @@ you also need to tell `valgrind` to not intercept those calls itself, and use: By setting the `MIMALLOC_SHOW_STATS` environment variable you can check that mimalloc is indeed used and not the standard allocator. Even though the [Valgrind option][valgrind-soname] -is called `--soname-synonyms`, this also -works when overriding with a static library or object file. Unfortunately, it is not possible to -dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`. +is called `--soname-synonyms`, this also works when overriding with a static library or object file. +To dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`, use: + +``` +> valgrind --trace-children=yes --soname-synonyms=somalloc=*mimalloc* /usr/bin/env LD_PRELOAD=/usr/lib/libmimalloc.so -- +``` + See also the `test/test-wrong.c` file to test with `valgrind`. Valgrind support is in its initial development -- please report any issues. @@ -523,7 +584,7 @@ To build with the address sanitizer, use the `-DMI_TRACK_ASAN=ON` cmake option: > cmake ../.. -DMI_TRACK_ASAN=ON ``` -This can also be combined with secure mode or debug mode. +This can also be combined with secure mode or debug mode. You can then run your programs as:' ``` @@ -531,7 +592,7 @@ You can then run your programs as:' ``` When you link a program with an address sanitizer build of mimalloc, you should -generally compile that program too with the address sanitizer enabled. +generally compile that program too with the address sanitizer enabled. For example, assuming you build mimalloc in `out/debug`: ``` @@ -540,23 +601,23 @@ clang -g -o test-wrong -Iinclude test/test-wrong.c out/debug/libmimalloc-asan-de Since the address sanitizer redirects the standard allocation functions, on some platforms (macOSX for example) it is required to compile mimalloc with `-DMI_OVERRIDE=OFF`. 
-Adress sanitizer support is in its initial development -- please report any issues. +Address sanitizer support is in its initial development -- please report any issues. [asan]: https://github.com/google/sanitizers/wiki/AddressSanitizer ## ETW Event tracing for Windows ([ETW]) provides a high performance way to capture all allocations though -mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option. +mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option. -You can then capture an allocation trace using the Windows performance recorder (WPR), using the +You can then capture an allocation trace using the Windows performance recorder (WPR), using the `src/prim/windows/etw-mimalloc.wprp` profile. In an admin prompt, you can use: ``` > wpr -start src\prim\windows\etw-mimalloc.wprp -filemode > > wpr -stop .etl -``` -and then open `.etl` in the Windows Performance Analyzer (WPA), or +``` +and then open `.etl` in the Windows Performance Analyzer (WPA), or use a tool like [TraceControl] that is specialized for analyzing mimalloc traces. 
[ETW]: https://learn.microsoft.com/en-us/windows-hardware/test/wpt/event-tracing-for-windows diff --git a/third-party/mimalloc/src/alloc-aligned.c b/third-party/mimalloc/src/alloc-aligned.c index ba629ef30a..8bf0a38dbb 100644 --- a/third-party/mimalloc/src/alloc-aligned.c +++ b/third-party/mimalloc/src/alloc-aligned.c @@ -24,6 +24,33 @@ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); } +#if MI_GUARDED +static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept { + // use over allocation for guarded blocksl + mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX); + const size_t oversize = size + alignment - 1; + void* base = _mi_heap_malloc_guarded(heap, oversize, zero); + void* p = mi_align_up_ptr(base, alignment); + mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size); + mi_assert_internal(mi_usable_size(p) >= size); + mi_assert_internal(_mi_is_aligned(p, alignment)); + return p; +} + +static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) { + const size_t rate = heap->guarded_sample_rate; + // only write if `rate!=0` so we don't write to the constant `_mi_heap_empty` + if (rate != 0) { heap->guarded_sample_rate = 0; } + void* p = _mi_heap_malloc_zero(heap, size, zero); + if (rate != 0) { heap->guarded_sample_rate = rate; } + return p; +} +#else +static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) { + return _mi_heap_malloc_zero(heap, size, zero); +} +#endif + // Fallback aligned allocation that over-allocates -- split out for better codegen static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { @@ -38,22 +65,24 @@ static mi_decl_noinline void* 
mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) if mi_unlikely(offset != 0) { // todo: cannot support offset alignment for very large alignments yet - #if MI_DEBUG > 0 +#if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); - #endif +#endif return NULL; } oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); + // note: no guarded as alignment > 0 p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block // zero afterwards as only the area from the aligned_p may be committed! if (p == NULL) return NULL; } else { // otherwise over-allocate - oversize = size + alignment - 1; - p = _mi_heap_malloc_zero(heap, oversize, zero); + oversize = (size < MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : size) + alignment - 1; // adjust for size <= 16; with size 0 and aligment 64k, we would allocate a 64k block and pointing just beyond that. + p = mi_heap_malloc_zero_no_guarded(heap, oversize, zero); if (p == NULL) return NULL; } + mi_page_t* page = _mi_ptr_page(p); // .. 
and align within the allocation const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)` @@ -62,17 +91,27 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t mi_assert_internal(adjust < alignment); void* aligned_p = (void*)((uintptr_t)p + adjust); if (aligned_p != p) { - mi_page_t* page = _mi_ptr_page(p); mi_page_set_has_aligned(page, true); + #if MI_GUARDED + // set tag to aligned so mi_usable_size works with guard pages + if (adjust >= sizeof(mi_block_t)) { + mi_block_t* const block = (mi_block_t*)p; + block->next = MI_BLOCK_TAG_ALIGNED; + } + #endif _mi_padding_shrink(page, (mi_block_t*)p, adjust + size); } // todo: expand padding if overallocated ? - mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); - mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_page(aligned_p), aligned_p)); + mi_assert_internal(mi_page_usable_block_size(page) >= adjust + size); mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); + #if MI_DEBUG > 1 + mi_page_t* const apage = _mi_ptr_page(aligned_p); + void* unalign_p = _mi_page_ptr_unalign(apage, aligned_p); + mi_assert_internal(p == unalign_p); + #endif // now zero the block if needed if (alignment > MI_BLOCK_ALIGNMENT_MAX) { @@ -85,6 +124,9 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t if (p != aligned_p) { mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); + #if MI_GUARDED + mi_track_mem_defined(p, sizeof(mi_block_t)); + #endif } return aligned_p; } @@ -94,27 +136,27 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t* { mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) - if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { + if 
mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } - + // use regular allocation if it is guaranteed to fit the alignment constraints. // this is important to try as the fast path in `mi_heap_malloc_zero_aligned` only works when there exist // a page with the right block size, and if we always use the over-alloc fallback that would never happen. if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { - void* p = _mi_heap_malloc_zero(heap, size, zero); + void* p = mi_heap_malloc_zero_no_guarded(heap, size, zero); mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); - const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; + const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; if mi_likely(is_aligned_or_null) { return p; } else { // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. 
mi_assert(false); - mi_free(p); + mi_free(p); } } @@ -122,6 +164,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t* return mi_heap_malloc_zero_aligned_at_overalloc(heap,size,alignment,offset,zero); } + // Primitive aligned allocation static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { @@ -132,11 +175,17 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t #endif return NULL; } - + + #if MI_GUARDED + if (offset==0 && alignment < MI_BLOCK_ALIGNMENT_MAX && mi_heap_malloc_use_guarded(heap,size)) { + return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero); + } + #endif + // try first if there happens to be a small block available with just the right alignment if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` - const size_t padsize = size + MI_PADDING_SIZE; + const size_t padsize = size + MI_PADDING_SIZE; mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); if mi_likely(page->free != NULL) { const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; @@ -310,3 +359,5 @@ mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_recalloc_aligned(mi_prim_get_default_heap(), p, newcount, size, alignment); } + + diff --git a/third-party/mimalloc/src/alloc-override.c b/third-party/mimalloc/src/alloc-override.c index 12837cdd94..b5109ded0a 100644 --- a/third-party/mimalloc/src/alloc-override.c +++ b/third-party/mimalloc/src/alloc-override.c @@ -248,7 +248,7 @@ extern "C" { // Forward Posix/Unix calls as well void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize) size_t malloc_size(const 
void* p) MI_FORWARD1(mi_usable_size,p) - #if !defined(__ANDROID__) && !defined(__FreeBSD__) + #if !defined(__ANDROID__) && !defined(__FreeBSD__) && !defined(__DragonFly__) size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p) #else size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p) @@ -289,8 +289,8 @@ mi_decl_weak int reallocarr(void* p, size_t count, size_t size) { return mi_r void __libc_free(void* p) MI_FORWARD0(mi_free, p) void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } -#elif defined(__GLIBC__) && defined(__linux__) - // forward __libc interface (needed for glibc-based Linux distributions) +#elif defined(__linux__) + // forward __libc interface (needed for glibc-based and musl-based Linux distributions) void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size) void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size) void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size) diff --git a/third-party/mimalloc/src/alloc.c b/third-party/mimalloc/src/alloc.c index 86aaae757b..ffa7b8b70d 100644 --- a/third-party/mimalloc/src/alloc.c +++ b/third-party/mimalloc/src/alloc.c @@ -28,20 +28,25 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. // Note: in release mode the (inlined) routine is about 7 instructions with a single test. 
-extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); + + // check the free list mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { return _mi_malloc_generic(heap, size, zero, 0); } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); + // pop from the free list page->free = mi_block_next(page, block); page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); + mi_assert_internal(page->block_size < MI_MAX_ALIGN_SIZE || _mi_is_aligned(block, MI_MAX_ALIGN_SIZE)); + #if MI_DEBUG>3 - if (page->free_is_zero) { + if (page->free_is_zero && size > sizeof(*block)) { mi_assert_expensive(mi_mem_is_zero(block+1,size - sizeof(*block))); } #endif @@ -54,7 +59,10 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ // zero the block? 
note: we need to zero the full block size (issue #63) if mi_unlikely(zero) { mi_assert_internal(page->block_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) + mi_assert_internal(!mi_page_is_huge(page)); + #if MI_PADDING mi_assert_internal(page->block_size >= MI_PADDING_SIZE); + #endif if (page->free_is_zero) { block->next = 0; mi_track_mem_defined(block, page->block_size - MI_PADDING_SIZE); @@ -91,7 +99,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); #endif mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess - padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); + padding->canary = mi_ptr_encode_canary(page,block,page->keys); padding->delta = (uint32_t)(delta); #if MI_PADDING_CHECK if (!mi_page_is_huge(page)) { @@ -113,19 +121,29 @@ extern void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t siz return _mi_page_malloc_zero(heap,page,size,true); } +#if MI_GUARDED +mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; +#endif + static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { mi_assert(heap != NULL); + mi_assert(size <= MI_SMALL_SIZE_MAX); #if MI_DEBUG const uintptr_t tid = _mi_thread_id(); mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local #endif - mi_assert(size <= MI_SMALL_SIZE_MAX); - #if (MI_PADDING) + #if (MI_PADDING || MI_GUARDED) if (size == 0) { size = sizeof(void*); } #endif + #if MI_GUARDED + if (mi_heap_malloc_use_guarded(heap,size)) { + return _mi_heap_malloc_guarded(heap, size, zero); + } + #endif + // get page in constant time, and allocate from it mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); - 
void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); + void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); #if MI_STAT>1 @@ -153,15 +171,23 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t si // The main allocation function extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { + // fast path for small objects if mi_likely(size <= MI_SMALL_SIZE_MAX) { mi_assert_internal(huge_alignment == 0); return mi_heap_malloc_small_zero(heap, size, zero); } + #if MI_GUARDED + else if (huge_alignment==0 && mi_heap_malloc_use_guarded(heap,size)) { + return _mi_heap_malloc_guarded(heap, size, zero); + } + #endif else { + // regular allocation mi_assert(heap!=NULL); mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic mi_track_malloc(p,size,zero); + #if MI_STAT>1 if (p != NULL) { if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } @@ -362,7 +388,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_ #ifndef PATH_MAX #define PATH_MAX MAX_PATH #endif -#include + mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { // todo: use GetFullPathNameW to allow longer file names char buf[PATH_MAX]; @@ -530,7 +556,7 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, si } mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) { - return mi_heap_alloc_new_n(mi_prim_get_default_heap(), size, count); + return mi_heap_alloc_new_n(mi_prim_get_default_heap(), count, size); } @@ -577,6 +603,82 @@ mi_decl_nodiscard void* mi_new_reallocn(void* 
p, size_t newcount, size_t size) { } } +#if MI_GUARDED +// We always allocate a guarded allocation at an offset (`mi_page_has_aligned` will be true). +// We then set the first word of the block to `0` for regular offset aligned allocations (in `alloc-aligned.c`) +// and the first word to `~0` for guarded allocations to have a correct `mi_usable_size` + +static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { + // TODO: we can still make padding work by moving it out of the guard page area + mi_page_t* const page = _mi_ptr_page(block); + mi_page_set_has_aligned(page, true); + block->next = MI_BLOCK_TAG_GUARDED; + + // set guard page at the end of the block + mi_segment_t* const segment = _mi_page_segment(page); + const size_t block_size = mi_page_block_size(page); // must use `block_size` to match `mi_free_local` + const size_t os_page_size = _mi_os_page_size(); + mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t)); + if (block_size < obj_size + os_page_size + sizeof(mi_block_t)) { + // should never happen + mi_free(block); + return NULL; + } + uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size; + mi_assert_internal(_mi_is_aligned(guard_page, os_page_size)); + if (segment->allow_decommit && _mi_is_aligned(guard_page, os_page_size)) { + _mi_os_protect(guard_page, os_page_size); + } + else { + _mi_warning_message("unable to set a guard page behind an object due to pinned memory (large OS pages?) 
(object %p of size %zu)\n", block, block_size); + } + + // align pointer just in front of the guard page + size_t offset = block_size - os_page_size - obj_size; + mi_assert_internal(offset > sizeof(mi_block_t)); + if (offset > MI_BLOCK_ALIGNMENT_MAX) { + // give up to place it right in front of the guard page if the offset is too large for unalignment + offset = MI_BLOCK_ALIGNMENT_MAX; + } + void* p = (uint8_t*)block + offset; + mi_track_align(block, p, offset, obj_size); + mi_track_mem_defined(block, sizeof(mi_block_t)); + return p; +} + +mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept +{ + #if defined(MI_PADDING_SIZE) + mi_assert(MI_PADDING_SIZE==0); + #endif + // allocate multiple of page size ending in a guard page + // ensure minimal alignment requirement? + const size_t os_page_size = _mi_os_page_size(); + const size_t obj_size = (mi_option_is_enabled(mi_option_guarded_precise) ? size : _mi_align_up(size, MI_MAX_ALIGN_SIZE)); + const size_t bsize = _mi_align_up(_mi_align_up(obj_size, MI_MAX_ALIGN_SIZE) + sizeof(mi_block_t), MI_MAX_ALIGN_SIZE); + const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size); + mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */); + if (block==NULL) return NULL; + void* const p = mi_block_ptr_set_guarded(block, obj_size); + + // stats + mi_track_malloc(p, size, zero); + if (p != NULL) { + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } + #if MI_STAT>1 + mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); + #endif + _mi_stat_counter_increase(&heap->tld->stats.guarded_alloc_count, 1); + } + #if MI_DEBUG>3 + if (p != NULL && zero) { + mi_assert_expensive(mi_mem_is_zero(p, size)); + } + #endif + return p; +} +#endif + // ------------------------------------------------------ // ensure explicit external inline definitions are emitted! 
// ------------------------------------------------------ @@ -584,6 +686,7 @@ mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) { #ifdef __cplusplus void* _mi_externs[] = { (void*)&_mi_page_malloc, + (void*)&_mi_page_malloc_zero, (void*)&_mi_heap_malloc_zero, (void*)&_mi_heap_malloc_zero_ex, (void*)&mi_malloc, diff --git a/third-party/mimalloc/src/arena-abandon.c b/third-party/mimalloc/src/arena-abandon.c new file mode 100644 index 0000000000..460c80fc22 --- /dev/null +++ b/third-party/mimalloc/src/arena-abandon.c @@ -0,0 +1,346 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#if !defined(MI_IN_ARENA_C) +#error "this file should be included from 'arena.c' (so mi_arena_t is visible)" +// add includes help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" +#endif + +// Minimal exports for arena-abandoned. +size_t mi_arena_id_index(mi_arena_id_t id); +mi_arena_t* mi_arena_from_index(size_t idx); +size_t mi_arena_get_count(void); +void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex); +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index); + +/* ----------------------------------------------------------- + Abandoned blocks/segments: + + _mi_arena_segment_clear_abandoned + _mi_arena_segment_mark_abandoned + + This is used to atomically abandon/reclaim segments + (and crosses the arena API but it is convenient to have here). + + Abandoned segments still have live blocks; they get reclaimed + when a thread frees a block in it, or when a thread needs a fresh + segment. 
+ + Abandoned segments are atomically marked in the `block_abandoned` + bitmap of arenas. Any segments allocated outside arenas are put + in the sub-process `abandoned_os_list`. This list is accessed + using locks but this should be uncommon and generally uncontended. + Reclaim and visiting either scan through the `block_abandoned` + bitmaps of the arena's, or visit the `abandoned_os_list` + + A potentially nicer design is to use arena's for everything + and perhaps have virtual arena's to map OS allocated memory + but this would lack the "density" of our current arena's. TBC. +----------------------------------------------------------- */ + + +// reclaim a specific OS abandoned segment; `true` on success. +// sets the thread_id. +static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) { + mi_assert(segment->memid.memkind != MI_MEM_ARENA); + // not in an arena, remove from list of abandoned os segments + mi_subproc_t* const subproc = segment->subproc; + if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) { + return false; // failed to acquire the lock, we just give up + } + // remove atomically from the abandoned os list (if possible!) 
+ bool reclaimed = false; + mi_segment_t* const next = segment->abandoned_os_next; + mi_segment_t* const prev = segment->abandoned_os_prev; + if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) { + #if MI_DEBUG>3 + // find ourselves in the abandoned list (and check the count) + bool found = false; + size_t count = 0; + for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) { + if (current == segment) { found = true; } + count++; + } + mi_assert_internal(found); + mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count)); + #endif + // remove (atomically) from the list and reclaim + if (prev != NULL) { prev->abandoned_os_next = next; } + else { subproc->abandoned_os_list = next; } + if (next != NULL) { next->abandoned_os_prev = prev; } + else { subproc->abandoned_os_list_tail = prev; } + segment->abandoned_os_next = NULL; + segment->abandoned_os_prev = NULL; + mi_atomic_decrement_relaxed(&subproc->abandoned_count); + mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count); + if (take_lock) { // don't reset the thread_id when iterating + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); + } + reclaimed = true; + } + if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } + return reclaimed; +} + +// reclaim a specific abandoned segment; `true` on success. +// sets the thread_id. +bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) { + if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { + return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */); + } + // arena segment: use the blocks_abandoned bitmap. 
+ size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); + mi_arena_t* arena = mi_arena_from_index(arena_idx); + mi_assert_internal(arena != NULL); + // reclaim atomically + bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); + if (was_marked) { + mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0); + mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); + } + // mi_assert_internal(was_marked); + mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return was_marked; +} + + +// mark a specific OS segment as abandoned +static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { + mi_assert(segment->memid.memkind != MI_MEM_ARENA); + // not in an arena; we use a list of abandoned segments + mi_subproc_t* const subproc = segment->subproc; + mi_lock(&subproc->abandoned_os_lock) { + // push on the tail of the list (important for the visitor) + mi_segment_t* prev = subproc->abandoned_os_list_tail; + mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL); + mi_assert_internal(segment->abandoned_os_prev == NULL); + mi_assert_internal(segment->abandoned_os_next == NULL); + if (prev != NULL) { prev->abandoned_os_next = segment; } + else { subproc->abandoned_os_list = segment; } + subproc->abandoned_os_list_tail = segment; + segment->abandoned_os_prev = prev; + segment->abandoned_os_next = NULL; + mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count); + mi_atomic_increment_relaxed(&subproc->abandoned_count); + // and release the lock + } + return; +} + +// mark a specific segment as abandoned +// clears the thread_id. 
+void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) +{ + mi_assert_internal(segment->used == segment->abandoned); + mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's + if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { + mi_arena_segment_os_mark_abandoned(segment); + return; + } + // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); + mi_arena_t* arena = mi_arena_from_index(arena_idx); + mi_assert_internal(arena != NULL); + // set abandonment atomically + mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned + const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); } + mi_assert_internal(was_unmarked); + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); +} + + +/* ----------------------------------------------------------- + Iterate through the abandoned blocks/segments using a cursor. + This is used for reclaiming and abandoned block visiting. 
+----------------------------------------------------------- */ + +// start a cursor at a randomized arena +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) { + mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); + current->bitmap_idx = 0; + current->subproc = subproc; + current->visit_all = visit_all; + current->hold_visit_lock = false; + const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count); + const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count); + const size_t max_arena = mi_arena_get_count(); + if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { + // for a heap that is bound to one arena, only visit that arena + current->start = mi_arena_id_index(heap->arena_id); + current->end = current->start + 1; + current->os_list_count = 0; + } + else { + // otherwise visit all starting at a random location + if (abandoned_count > abandoned_list_count && max_arena > 0) { + current->start = (heap == NULL || max_arena == 0 ? 
0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); + current->end = current->start + max_arena; + } + else { + current->start = 0; + current->end = 0; + } + current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list + } + mi_assert_internal(current->start <= max_arena); +} + +void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) { + if (current->hold_visit_lock) { + mi_lock_release(¤t->subproc->abandoned_os_visit_lock); + current->hold_visit_lock = false; + } +} + +static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { + // try to reclaim an abandoned segment in the arena atomically + if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + // check that the segment belongs to our sub-process + // note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled. + // without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process. + // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock. 
+ if (segment->subproc != subproc) { + // it is from another sub-process, re-mark it and continue searching + const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + mi_assert_internal(was_zero); MI_UNUSED(was_zero); + return NULL; + } + else { + // success, we unabandoned a segment in our sub-process + mi_atomic_decrement_relaxed(&subproc->abandoned_count); + return segment; + } +} + +static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) { + const size_t max_arena = mi_arena_get_count(); + size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); + size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx); + // visit arena's (from the previous cursor) + for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { + // index wraps around + size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start); + mi_arena_t* arena = mi_arena_from_index(arena_idx); + if (arena != NULL) { + bool has_lock = false; + // visit the abandoned fields (starting at previous_idx) + for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { + size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); + if mi_unlikely(field != 0) { // skip zero fields quickly + // we only take the arena lock if there are actually abandoned segments present + if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { + has_lock = (previous->visit_all ? 
(mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); + if (!has_lock) { + if (previous->visit_all) { + _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); + } + // skip to next arena + break; + } + } + mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); + // visit each set bit in the field (todo: maybe use `ctz` here?) + for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { + // pre-check if the bit is set + size_t mask = ((size_t)1 << bit_idx); + if mi_unlikely((field & mask) == mask) { + mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); + if (segment != NULL) { + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } + previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration + return segment; + } + } + } + } + } + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } + } + } + return NULL; +} + +static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) { + // go through the abandoned_os_list + // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. + // The lock is released when the cursor is released. + if (!previous->hold_visit_lock) { + previous->hold_visit_lock = (previous->visit_all ? 
(mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true) + : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); + if (!previous->hold_visit_lock) { + if (previous->visit_all) { + _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); + } + return NULL; // we cannot get the lock, give up + } + } + // One list entry at a time + while (previous->os_list_count > 0) { + previous->os_list_count--; + mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` + mi_segment_t* segment = previous->subproc->abandoned_os_list; + // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) + if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { + mi_lock_release(&previous->subproc->abandoned_os_lock); + return segment; + } + // already abandoned, try again + mi_lock_release(&previous->subproc->abandoned_os_lock); + } + // done + mi_assert_internal(previous->os_list_count == 0); + return NULL; +} + + +// reclaim abandoned segments +// this does not set the thread id (so it appears as still abandoned) +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) { + if (previous->start < previous->end) { + // walk the arena + mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous); + if (segment != NULL) { return segment; } + } + // no entries in the arena's anymore, walk the abandoned OS list + mi_assert_internal(previous->start == previous->end); + return mi_arena_segment_clear_abandoned_next_list(previous); +} + + +bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + // (unfortunately) the visit_abandoned option must be enabled from the start. 
+  // This is to avoid taking locks if abandoned list visiting is not required (as for most programs)
+  if (!mi_option_is_enabled(mi_option_visit_abandoned)) {
+    _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON");
+    return false;
+  }
+  mi_arena_field_cursor_t current;
+  _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, &current);
+  mi_segment_t* segment;
+  bool ok = true;
+  while (ok && (segment = _mi_arena_segment_clear_abandoned_next(&current)) != NULL) {
+    ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg);
+    _mi_arena_segment_mark_abandoned(segment);
+  }
+  _mi_arena_field_cursor_done(&current);
+  return ok;
+}
diff --git a/third-party/mimalloc/src/arena.c b/third-party/mimalloc/src/arena.c
index 648ee844fe..a62a547b68 100644
--- a/third-party/mimalloc/src/arena.c
+++ b/third-party/mimalloc/src/arena.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2019-2023, Microsoft Research, Daan Leijen
+Copyright (c) 2019-2024, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -11,69 +11,68 @@
 large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). In contrast to the
 rest of mimalloc, the arenas are shared between threads and need to be
 accessed using atomic operations.
-Arenas are used to for huge OS page (1GiB) reservations or for reserving
+Arenas are also used to for huge OS page (1GiB) reservations or for reserving
 OS memory upfront which can be improve performance or is sometimes needed
 on embedded devices. We can also employ this with WASI or `sbrk` systems
 to reserve large arenas upfront and be able to reuse the memory more effectively.
 The arena allocation needs to be thread safe and we use an atomic bitmap to allocate.
-----------------------------------------------------------------------------*/ + #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" +#include "bitmap.h" -#include // memset -#include // ENOMEM - -#include "bitmap.h" // atomic bitmap /* ----------------------------------------------------------- Arena allocation ----------------------------------------------------------- */ -// Block info: bit 0 contains the `in_use` bit, the upper bits the -// size in count of arena blocks. -typedef uintptr_t mi_block_info_t; -#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (112) // not more than 126 (since we use 7 bits in the memid and an arena index + 1) - // A memory arena descriptor typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific - mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*) start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) - int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(size_t) search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // are the blocks committed? 
(can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) + mi_arena_id_t id; // arena id; 0 for non-specific + mi_memid_t memid; // memid of the memory area + _Atomic(uint8_t*) start; // the start of the memory area + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) + size_t meta_size; // size of the arena structure itself (including its bitmaps) + mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + int numa_node; // associated NUMA node + bool exclusive; // only allow allocations if specifically for this arena + bool is_large; // memory area consists of large- or huge OS pages (always committed) + mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited + _Atomic(size_t) search_idx; // optimization to start the search for free blocks + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be purged from `blocks_purge`. + + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. 
(This crosses API's but it is convenient to have here) + mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. } mi_arena_t; +#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB +#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) + // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 +static mi_decl_cache_align _Atomic(int64_t) mi_arenas_purge_expire; // set if there exist purgeable arenas - -//static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept; +#define MI_IN_ARENA_C +#include "arena-abandon.c" +#undef MI_IN_ARENA_C /* ----------------------------------------------------------- Arena id's id = arena_index + 1 ----------------------------------------------------------- */ -static size_t mi_arena_id_index(mi_arena_id_t id) { +size_t mi_arena_id_index(mi_arena_id_t id) { return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); } @@ -104,6 +103,16 @@ bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) { return (memid.memkind == MI_MEM_OS); } +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + /* ----------------------------------------------------------- Arena allocations get a (currently) 16-bit memory id where the lower 8 bits are the arena id, and the upper bits the block index. 
@@ -129,7 +138,7 @@ static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_ return memid; } -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); *bitmap_index = memid.mem.arena.block_index; @@ -140,10 +149,10 @@ static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bit /* ----------------------------------------------------------- Special static area for mimalloc internal structures - to avoid OS calls (for example, for the arena metadata) + to avoid OS calls (for example, for the arena metadata (~= 256b)) ----------------------------------------------------------- */ -#define MI_ARENA_STATIC_MAX (MI_INTPTR_SIZE*MI_KiB) // 8 KiB on 64-bit +#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; @@ -175,7 +184,7 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m return p; } -static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); // try static @@ -183,7 +192,7 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st if (p != NULL) return p; // or fall back to the OS - p = _mi_os_alloc(size, memid, stats); + p = _mi_os_alloc(size, memid); if (p == NULL) return NULL; // zero the OS memory if needed @@ -194,16 +203,16 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st return p; } -static void mi_arena_meta_free(void* p, mi_memid_t memid, size_t size, 
mi_stats_t* stats) { +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, stats); + _mi_os_free(p, size, memid); } else { mi_assert(memid.memkind == MI_MEM_STATIC); } } -static void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { +void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); } @@ -213,10 +222,10 @@ static void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { ----------------------------------------------------------- */ // claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { + if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) { mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around return true; }; @@ -229,13 +238,13 @@ static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index ----------------------------------------------------------- */ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, mi_memid_t* memid, mi_os_tld_t* tld) + bool commit, mi_memid_t* memid) { MI_UNUSED(arena_index); mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return 
NULL; + if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL; // claimed it! void* p = mi_arena_block_start(arena, bitmap_index); @@ -265,7 +274,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); if (any_uncommitted) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero)) { memid->initially_committed = false; } else { @@ -281,19 +290,19 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar return p; } -// allocate in a speficic arena +// allocate in a specific arena static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) + bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid ) { MI_UNUSED_RELEASE(alignment); - mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); + mi_assert(alignment <= MI_SEGMENT_ALIGN); const size_t bcount = mi_block_count_of_size(size); const size_t arena_index = mi_arena_id_index(arena_id); mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); mi_assert_internal(size <= mi_arena_block_size(bcount)); // Check arena suitability - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + mi_arena_t* arena = mi_arena_from_index(arena_index); if (arena == NULL) return NULL; if (!allow_large && arena->is_large) return NULL; if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; @@ -304,7 +313,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no } // try to allocate - void* p = mi_arena_try_alloc_at(arena, 
arena_index, bcount, commit, memid, tld); + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid); mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); return p; } @@ -313,7 +322,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no // allocate from an arena with fallback to the OS static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) + mi_arena_id_t req_arena_id, mi_memid_t* memid ) { MI_UNUSED(alignment); mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); @@ -323,21 +332,21 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz if (req_arena_id != _mi_arena_id_none()) { // try a specific arena if requested if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid); if (p != NULL) return p; } } else { // try numa affine allocation for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid); if (p != NULL) return p; } // try from another numa node instead.. 
if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid); if (p != NULL) return p; } } @@ -346,11 +355,10 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz } // try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t *arena_id) { if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); if (arena_count > (MI_MAX_ARENAS - 4)) return false; @@ -361,8 +369,14 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); if (arena_count >= 8 && arena_count <= 128) { - arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve; // scale up the arena sizes exponentially + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } } if (arena_reserve < req_size) return false; // should be able to at least 
handle the current allocation size @@ -376,27 +390,28 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) + mi_arena_id_t req_arena_id, mi_memid_t* memid) { - mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(memid != NULL); mi_assert_internal(size > 0); *memid = _mi_memid_none(); - const int numa_node = _mi_os_numa_node(tld); // current numa node + const int numa_node = _mi_os_numa_node(); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc)) { // is arena allocation allowed? 
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) + { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid); if (p != NULL) return p; // otherwise, try to first eagerly reserve a new arena if (req_arena_id == _mi_arena_id_none()) { mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + if (mi_arena_reserve(size, allow_large, &arena_id)) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid); if (p != NULL) return p; } } @@ -411,16 +426,16 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset // finally, fall back to the OS if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid); } else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid); } @@ -446,7 +461,7 @@ static long mi_arena_purge_delay(void) { // reset or decommit in an arena and update the committed/decommit bitmaps // assumes we own 
the area (i.e. blocks_in_use is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { +static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks) { mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); mi_assert_internal(!arena->memid.is_pinned); @@ -455,7 +470,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, bool needs_recommit; if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); + needs_recommit = _mi_os_purge(p, size); } else { // some blocks are not committed -- this can happen when a partially committed block is freed @@ -463,8 +478,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), // and also undo the decommit stats (as it was already adjusted) mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, 0); } // clear the purged blocks @@ -477,23 +491,26 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. 
// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks) { mi_assert_internal(arena->blocks_purge != NULL); const long delay = mi_arena_purge_delay(); if (delay < 0) return; // is purging allowed at all? if (_mi_preloading() || delay == 0) { // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); + mi_arena_purge(arena, bitmap_idx, blocks); } else { - // schedule decommit - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + // schedule purge + const mi_msecs_t expire = _mi_clock_now() + delay; + mi_msecs_t expire0 = 0; + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, expire)) { + // expiration was not yet set + // maybe set the global arenas expire as well (if it wasn't set already) + mi_atomic_casi64_strong_acq_rel(&mi_arenas_purge_expire, &expire0, expire); } else { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + // already an expiration was set } _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); } @@ -502,12 +519,12 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t // purge a range of blocks // return true if the full range was purged. // assumes we own the area (i.e. 
blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge) { const size_t endidx = startidx + bitlen; size_t bitidx = startidx; bool all_purged = false; while (bitidx < endidx) { - // count consequetive ones in the purge mask + // count consecutive ones in the purge mask size_t count = 0; while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { count++; @@ -515,7 +532,7 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, if (count > 0) { // found range to be purged const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); - mi_arena_purge(arena, range_idx, count, stats); + mi_arena_purge(arena, range_idx, count); if (count == bitlen) { all_purged = true; } @@ -526,16 +543,18 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, } // returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { - if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; + // check pre-conditions + if (arena->memid.is_pinned) return false; + + // expired yet? 
mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) return false; - if (!force && expire > now) return false; + if (!force && (expire == 0 || expire > now)) return false; // reset expire (if not already set concurrently) mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); - + // potential purges scheduled, walk through the bitmap bool any_purged = false; bool full_purge = true; @@ -544,11 +563,12 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (purge != 0) { size_t bitidx = 0; while (bitidx < MI_BITMAP_FIELD_BITS) { - // find consequetive range of ones in the purge mask + // find consecutive range of ones in the purge mask size_t bitlen = 0; while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; } + // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); while( bitlen > 0 ) { @@ -561,7 +581,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (bitlen > 0) { // read purge again now that we have the in_use bits purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { + if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge)) { full_purge = false; } any_purged = true; @@ -581,9 +601,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi return any_purged; } -static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { +static void mi_arenas_try_purge( bool force, bool visit_all ) +{ if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + // check if any arena needs purging? 
+ const mi_msecs_t now = _mi_clock_now(); + mi_msecs_t arenas_expire = mi_atomic_loadi64_acquire(&mi_arenas_purge_expire); + if (!force && (arenas_expire == 0 || arenas_expire < now)) return; + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); if (max_arena == 0) return; @@ -591,17 +617,26 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) static mi_atomic_guard_t purge_guard; mi_atomic_guard(&purge_guard) { - mi_msecs_t now = _mi_clock_now(); - size_t max_purge_count = (visit_all ? max_arena : 1); + // increase global expire: at most one purge per delay cycle + mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay()); + size_t max_purge_count = (visit_all ? max_arena : 2); + bool all_visited = true; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { - if (mi_arena_try_purge(arena, now, force, stats)) { - if (max_purge_count <= 1) break; + if (mi_arena_try_purge(arena, now, force)) { + if (max_purge_count <= 1) { + all_visited = false; + break; + } max_purge_count--; } } } + if (all_visited) { + // all arena's were visited and purged: reset global expire + mi_atomic_storei64_release(&mi_arenas_purge_expire, 0); + } } } @@ -610,20 +645,23 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) Arena free ----------------------------------------------------------- */ -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid) { + mi_assert_internal(size > 0); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; const bool all_committed = (committed_size == size); + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
+ mi_track_mem_undefined(p,size); + if (mi_memkind_is_os(memid.memkind)) { // was a direct OS allocation, pass through if (!all_committed && committed_size > 0) { // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } - _mi_os_free(p, size, memid, stats); + _mi_os_free(p, size, memid); } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena @@ -646,9 +684,6 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi return; } - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) - mi_track_mem_undefined(p,size); - // potentially decommit if (arena->memid.is_pinned || arena->blocks_committed == NULL) { mi_assert_internal(all_committed); @@ -671,7 +706,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // works (as we should never reset decommitted parts). 
} // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); + mi_arena_schedule_purge(arena, bitmap_idx, blocks); } // and make it available to others again @@ -687,7 +722,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // purge expired decommits - mi_arenas_try_purge(false, false, stats); + mi_arenas_try_purge(false, false); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` @@ -698,14 +733,15 @@ static void mi_arenas_unsafe_destroy(void) { for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); + _mi_os_free(arena->start, mi_arena_size(arena), arena->memid); } else { new_max_arena = i; } - mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size, &_mi_stats_main); + _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); } } @@ -715,22 +751,22 @@ static void mi_arenas_unsafe_destroy(void) { } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +void _mi_arenas_collect(bool force_purge) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. 
-void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { +void _mi_arena_unsafe_destroy_all(void) { mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } // Is a pointer inside any of our arenas? bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { return true; } @@ -738,139 +774,6 @@ bool _mi_arena_contains(const void* p) { return false; } -/* ----------------------------------------------------------- - Abandoned blocks/segments. - This is used to atomically abandon/reclaim segments - (and crosses the arena API but it is convenient to have here). - Abandoned segments still have live blocks; they get reclaimed - when a thread frees a block in it, or when a thread needs a fresh - segment; these threads scan the abandoned segments through - the arena bitmaps. ------------------------------------------------------------ */ - -// Maintain a count of all abandoned segments -static mi_decl_cache_align _Atomic(size_t)abandoned_count; - -size_t _mi_arena_segment_abandoned_count(void) { - return mi_atomic_load_relaxed(&abandoned_count); -} - -// reclaim a specific abandoned segment; `true` on success. -// sets the thread_id. -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) -{ - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena, consider it un-abandoned now. - // but we need to still claim it atomically -- we use the thread_id for that. 
- size_t expected = 0; - if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { - mi_atomic_decrement_relaxed(&abandoned_count); - return true; - } - else { - return false; - } - } - // arena segment: use the blocks_abandoned bitmap. - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&abandoned_count); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - // mi_assert_internal(was_marked); - mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_marked; -} - -// mark a specific segment as abandoned -// clears the thread_id. 
-void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) -{ - mi_atomic_store_release(&segment->thread_id, 0); - mi_assert_internal(segment->used == segment->abandoned); - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena; count it as abandoned and return - mi_atomic_increment_relaxed(&abandoned_count); - return; - } - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&abandoned_count); } - mi_assert_internal(was_unmarked); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); -} - -// start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - current->start = (max_arena == 0 ? 
0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); - current->count = 0; - current->bitmap_idx = 0; -} - -// reclaim abandoned segments -// this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) -{ - const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); - if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; - - int count = previous->count; - size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; - // visit arena's (from previous) - for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { - mi_arena_id_t arena_idx = previous->start + count; - if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - if (arena != NULL) { - // visit the abandoned fields (starting at previous_idx) - for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); - if mi_unlikely(field != 0) { // skip zero fields quickly - // visit each set bit in the field (todo: maybe use `ctz` here?) 
- for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { - // pre-check if the bit is set - size_t mask = ((size_t)1 << bit_idx); - if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - // try to reclaim it atomically - if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { - mi_atomic_decrement_relaxed(&abandoned_count); - previous->bitmap_idx = bitmap_idx; - previous->count = count; - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return segment; - } - } - } - } - } - } - } - // no more found - previous->bitmap_idx = 0; - previous->count = 0; - return NULL; -} - - /* ----------------------------------------------------------- Add an arena. 
----------------------------------------------------------- */ @@ -896,18 +799,30 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_BLOCK_SIZE) return false; - + if (size < MI_ARENA_BLOCK_SIZE) { + _mi_warning_message("the arena size is too small (memory at %p with size %zu)\n", start, size); + return false; + } if (is_large) { mi_assert_internal(memid.initially_committed && memid.is_pinned); } + if (!_mi_is_aligned(start, MI_SEGMENT_ALIGN)) { + void* const aligned_start = mi_align_up_ptr(start, MI_SEGMENT_ALIGN); + const size_t diff = (uint8_t*)aligned_start - (uint8_t*)start; + if (diff >= size || (size - diff) < MI_ARENA_BLOCK_SIZE) { + _mi_warning_message("after alignment, the size of the arena becomes too small (memory at %p with size %zu)\n", start, size); + return false; + } + start = aligned_start; + size = size - diff; + } const size_t bcount = size / MI_ARENA_BLOCK_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? 
+ mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); if (arena == NULL) return false; // already zero'd due to zalloc @@ -924,7 +839,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; - // consequetive bitmaps + mi_lock_init(&arena->abandoned_visit_lock); + // consecutive bitmaps arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap @@ -959,11 +875,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } @@ -1011,11 +927,11 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ return inuse_count; } -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_inuse) mi_attr_noexcept { size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); size_t inuse_total = 0; - size_t abandoned_total = 0; - size_t purge_total = 0; + //size_t abandoned_total = 0; + //size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; @@ -1026,16 +942,16 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) if (arena->blocks_committed != NULL) { mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); } - if (show_abandoned) { - abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); - } - if (show_purge && arena->blocks_purge != NULL) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); - } + //if (show_abandoned) { + // abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); + //} + //if (show_purge && arena->blocks_purge != NULL) { + // purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + //} } if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); - if 
(show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); + //if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + //if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); } @@ -1059,7 +975,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); + _mi_os_free(p, hsize, memid); return ENOMEM; } return 0; @@ -1105,4 +1021,3 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; return err; } - diff --git a/third-party/mimalloc/src/bitmap.c b/third-party/mimalloc/src/bitmap.c index 4b6be66bcd..084082fb72 100644 --- a/third-party/mimalloc/src/bitmap.c +++ b/third-party/mimalloc/src/bitmap.c @@ -200,7 +200,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Try to atomically claim a sequence of `count` bits starting from the field // at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. 
// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) +static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); @@ -260,7 +260,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); // claimed! - mi_stat_counter_increase(stats->arena_crossover_count,1); + mi_stat_counter_increase(_mi_stats_main.arena_crossover_count,1); *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); return true; @@ -280,10 +280,10 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit newmap = (map & ~initial_mask); } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); } - mi_stat_counter_increase(stats->arena_rollback_count,1); + mi_stat_counter_increase(_mi_stats_main.arena_rollback_count,1); // retry? (we make a recursive call instead of goto to be able to use const declarations) if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); + return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx); } else { return false; @@ -293,7 +293,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. 
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(count > 0); if (count <= 2) { // we don't bother with crossover fields for small counts @@ -313,7 +313,7 @@ bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitm } */ // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { + if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) { return true; } } diff --git a/third-party/mimalloc/src/bitmap.h b/third-party/mimalloc/src/bitmap.h index d8316b83f4..f098dd8f13 100644 --- a/third-party/mimalloc/src/bitmap.h +++ b/third-party/mimalloc/src/bitmap.h @@ -35,9 +35,13 @@ typedef mi_bitmap_field_t* mi_bitmap_t; typedef size_t mi_bitmap_index_t; // Create a bit index. +static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) { + mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS); + return (idx*MI_BITMAP_FIELD_BITS) + bitidx; +} static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); - return (idx*MI_BITMAP_FIELD_BITS) + bitidx; + return mi_bitmap_index_create_ex(idx,bitidx); } // Create a bit index. @@ -99,7 +103,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. 
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. diff --git a/third-party/mimalloc/src/free.c b/third-party/mimalloc/src/free.c index b9cb634616..f856da77a6 100644 --- a/third-party/mimalloc/src/free.c +++ b/third-party/mimalloc/src/free.c @@ -9,7 +9,6 @@ terms of the MIT license. A copy of the license can be found in the file // add includes help an IDE #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" // _mi_prim_thread_id() #endif @@ -35,7 +34,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); if (track_stats) { mi_stat_free(page, block); } - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN && !MI_GUARDED if (!mi_page_is_huge(page)) { // huge page content may be already decommitted memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); } @@ -54,8 +53,8 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Adjust a block that was allocated aligned, to the actual start of the block in the page. 
-// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the -// `page_start` and `block_size` fields; however these are constant and the page won't be +// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the +// `page_start` and `block_size` fields; however these are constant and the page won't be // deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently. mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); @@ -72,16 +71,30 @@ mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { return (mi_block_t*)((uintptr_t)p - adjust); } +// forward declaration for a MI_GUARDED build +#if MI_GUARDED +static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p); // forward declaration +static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) { + if (mi_block_ptr_is_guarded(block, p)) { mi_block_unguard(page, block, p); } +} +#else +static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) { + MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(p); +} +#endif + // free a local pointer (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { MI_UNUSED(segment); mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); + mi_block_check_unguard(page, block, p); mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); } // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_block_check_unguard(page, block, p); mi_free_block_mt(page, segment, block); } @@ -98,17 +111,17 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms { MI_UNUSED(msg); -#if (MI_DEBUG>0) - if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { + #if (MI_DEBUG>0) + if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) { _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } -#endif + #endif mi_segment_t* const segment = _mi_ptr_segment(p); if mi_unlikely(segment==NULL) return segment; -#if (MI_DEBUG>0) + #if (MI_DEBUG>0) if mi_unlikely(!mi_is_in_heap_region(p)) { #if (MI_INTPTR_SIZE == 8 && defined(__linux__)) if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640) @@ -122,13 +135,13 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms } } } -#endif -#if (MI_DEBUG>0 || MI_SECURE>=4) + #endif + #if (MI_DEBUG>0 || MI_SECURE>=4) if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); return NULL; } -#endif + #endif return segment; } @@ -240,15 +253,17 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) { // first see if the 
segment was abandoned and if we can reclaim it into our thread - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && #if MI_HUGE_PAGE_ABANDON segment->page_kind != MI_PAGE_HUGE && #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) + mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned? + mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) { // the segment is abandoned, try to reclaim it into our heap if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } @@ -299,7 +314,13 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p const size_t size = mi_page_usable_size_of(page, block); const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); - return (size - adjust); + const size_t aligned_size = (size - adjust); + #if MI_GUARDED + if (mi_block_ptr_is_guarded(block, p)) { + return aligned_size - _mi_os_page_size(); + } + #endif + return aligned_size; } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { @@ -409,7 +430,7 @@ static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* bloc uintptr_t keys[2]; keys[0] = page->keys[0]; keys[1] = page->keys[1]; - bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); + bool ok = (mi_ptr_encode_canary(page,block,keys) == canary && *delta <= *bsize); 
mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); return ok; } @@ -528,3 +549,23 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif + + +// Remove guard page when building with MI_GUARDED +#if MI_GUARDED +static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) { + MI_UNUSED(p); + mi_assert_internal(mi_block_ptr_is_guarded(block, p)); + mi_assert_internal(mi_page_has_aligned(page)); + mi_assert_internal((uint8_t*)p - (uint8_t*)block >= (ptrdiff_t)sizeof(mi_block_t)); + mi_assert_internal(block->next == MI_BLOCK_TAG_GUARDED); + + const size_t bsize = mi_page_block_size(page); + const size_t psize = _mi_os_page_size(); + mi_assert_internal(bsize > psize); + mi_assert_internal(_mi_page_segment(page)->allow_decommit); + void* gpage = (uint8_t*)block + bsize - psize; + mi_assert_internal(_mi_is_aligned(gpage, psize)); + _mi_os_unprotect(gpage, psize); +} +#endif diff --git a/third-party/mimalloc/src/heap.c b/third-party/mimalloc/src/heap.c index e498fdb209..f39dfb0ba0 100644 --- a/third-party/mimalloc/src/heap.c +++ b/third-party/mimalloc/src/heap.c @@ -59,7 +59,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ MI_UNUSED(pq); mi_assert_internal(mi_page_heap(page) == heap); mi_segment_t* segment = _mi_page_segment(page); - mi_assert_internal(segment->thread_id == heap->thread_id); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == heap->thread_id); mi_assert_expensive(_mi_page_is_valid(page)); return true; } @@ -98,7 +98,7 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t if (collect == MI_FORCE) { // note: call before a potential `_mi_page_free` as the segment may be freed if this was the last used page in that segment. mi_segment_t* segment = _mi_page_segment(page); - _mi_segment_collect(segment, true /* force? */, &heap->tld->segments); + _mi_segment_collect(segment, true /* force? 
*/); } if (mi_page_all_free(page)) { // no more used blocks, free the page. @@ -143,6 +143,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) if (force_main) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. + // note: this only collects in the current subprocess _mi_abandoned_reclaim_all(heap, &heap->tld->segments); } @@ -170,9 +171,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) if (force && is_main_thread && mi_heap_is_backing(heap)) { _mi_thread_data_collect(); // collect thread data cache } - + // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */); } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -227,22 +228,28 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->cookie = _mi_heap_random_next(heap) | 1; heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); + _mi_heap_guarded_init(heap); // push on the thread local heaps list heap->next = heap->tld->heaps; heap->tld->heaps = heap; } -mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { +mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; - // don't reclaim abandoned pages or otherwise destroy is unsafe - _mi_heap_init(heap, bheap->tld, arena_id, true /* no reclaim */, 0 /* default tag */); + mi_assert(heap_tag >= 0 && heap_tag < 256); + _mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? 
*/, (uint8_t)heap_tag /* heap tag */); return heap; } +mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { + return mi_heap_new_ex(0 /* default heap tag */, false /* don't allow `mi_heap_destroy` */, arena_id); +} + mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { - return mi_heap_new_in_arena(_mi_arena_id_none()); + // don't reclaim abandoned memory or otherwise destroy is unsafe + return mi_heap_new_ex(0 /* default heap tag */, true /* no reclaim */, _mi_arena_id_none()); } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { @@ -375,7 +382,13 @@ void mi_heap_destroy(mi_heap_t* heap) { mi_assert(heap->no_reclaim); mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; + #if MI_GUARDED + // _mi_warning_message("'mi_heap_destroy' called but MI_GUARDED is enabled -- using `mi_heap_delete` instead (heap at %p)\n", heap); + mi_heap_delete(heap); + return; + #else if (!heap->no_reclaim) { + _mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap); // don't free in case it may contain reclaimed pages mi_heap_delete(heap); } @@ -388,12 +401,14 @@ void mi_heap_destroy(mi_heap_t* heap) { _mi_heap_destroy_pages(heap); mi_heap_free(heap); } + #endif } // forcefully destroy all heaps in the current thread -void _mi_heap_unsafe_destroy_all(void) { - mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* curr = bheap->tld->heaps; +void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) { + mi_assert_internal(heap != NULL); + if (heap == NULL) return; + mi_heap_t* curr = heap->tld->heaps; while (curr != NULL) { mi_heap_t* next = curr->next; if (curr->no_reclaim) { @@ -444,6 +459,12 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { mi_heap_reset_pages(from); } +// are two heaps compatible with respect to heap-tag, exclusive arena etc. 
+static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) { + return (heap1->tag == heap2->tag && // store same kind of objects + heap1->arena_id == heap2->arena_id); // same arena preference +} + // Safe delete a heap without freeing any still allocated blocks in that heap. void mi_heap_delete(mi_heap_t* heap) { @@ -452,9 +473,10 @@ void mi_heap_delete(mi_heap_t* heap) mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; - if (!mi_heap_is_backing(heap)) { + mi_heap_t* bheap = heap->tld->heap_backing; + if (bheap != heap && mi_heaps_are_compatible(bheap,heap)) { // transfer still used pages to the backing heap - mi_heap_absorb(heap->tld->heap_backing, heap); + mi_heap_absorb(bheap, heap); } else { // the backing heap abandons its pages @@ -527,54 +549,97 @@ bool mi_check_owned(const void* p) { enable visiting all blocks of all heaps across threads ----------------------------------------------------------- */ -// Separate struct to keep `mi_page_t` out of the public interface -typedef struct mi_heap_area_ex_s { - mi_heap_area_t area; - mi_page_t* page; -} mi_heap_area_ex_t; +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); + area->reserved = page->reserved * bsize; + area->committed = page->capacity * bsize; + area->blocks = mi_page_start(page); + area->used = page->used; // number of blocks in use (#553) + area->block_size = ubsize; + area->full_block_size = bsize; + area->heap_tag = page->heap_tag; +} + + +static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) { + mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX); + *shift = MI_INTPTR_BITS - mi_clz(divisor - 1); + *magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1); +} + +static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) { + mi_assert_internal(n <= 
UINT32_MAX); + const uint64_t hi = ((uint64_t)n * magic) >> 32; + return (size_t)((hi + n) >> shift); +} -static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) { - mi_assert(xarea != NULL); - if (xarea==NULL) return true; - const mi_heap_area_t* area = &xarea->area; - mi_page_t* page = xarea->page; +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { + mi_assert(area != NULL); + if (area==NULL) return true; mi_assert(page != NULL); if (page == NULL) return true; - _mi_page_free_collect(page,true); + _mi_page_free_collect(page,true); // collect both thread_delayed and local_free mi_assert_internal(page->local_free == NULL); if (page->used == 0) return true; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); // without padding - size_t psize; - uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + size_t psize; + uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + mi_heap_t* const heap = mi_page_heap(page); + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); // without padding + // optimize page with one block if (page->capacity == 1) { - // optimize page with one block mi_assert_internal(page->used == 1 && page->free == NULL); return visitor(mi_page_heap(page), area, pstart, ubsize, arg); } + mi_assert(bsize <= UINT32_MAX); + + // optimize full pages + if (page->used == page->capacity) { + uint8_t* block = pstart; + for (size_t i = 0; i < page->capacity; i++) { + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } + return true; + } // create a bitmap of free blocks. 
#define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) - uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)]; - memset(free_map, 0, sizeof(free_map)); + uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS]; + const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS); + memset(free_map, 0, bmapsize * sizeof(intptr_t)); + if (page->capacity % MI_INTPTR_BITS != 0) { + // mark left-over bits at the end as free + size_t shift = (page->capacity % MI_INTPTR_BITS); + uintptr_t mask = (UINTPTR_MAX << shift); + free_map[bmapsize - 1] = mask; + } + + // fast repeated division by the block size + uint64_t magic; + size_t shift; + mi_get_fast_divisor(bsize, &magic, &shift); #if MI_DEBUG>1 size_t free_count = 0; #endif - for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { + for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) { #if MI_DEBUG>1 free_count++; #endif mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); size_t offset = (uint8_t*)block - pstart; mi_assert_internal(offset % bsize == 0); - size_t blockidx = offset / bsize; // Todo: avoid division? 
- mi_assert_internal( blockidx < MI_MAX_BLOCKS); - size_t bitidx = (blockidx / sizeof(uintptr_t)); - size_t bit = blockidx - (bitidx * sizeof(uintptr_t)); + mi_assert_internal(offset <= UINT32_MAX); + size_t blockidx = mi_fast_divide(offset, magic, shift); + mi_assert_internal(blockidx == offset / bsize); + mi_assert_internal(blockidx < MI_MAX_BLOCKS); + size_t bitidx = (blockidx / MI_INTPTR_BITS); + size_t bit = blockidx - (bitidx * MI_INTPTR_BITS); free_map[bitidx] |= ((uintptr_t)1 << bit); } mi_assert_internal(page->capacity == (free_count + page->used)); @@ -583,42 +648,53 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v #if MI_DEBUG>1 size_t used_count = 0; #endif - for (size_t i = 0; i < page->capacity; i++) { - size_t bitidx = (i / sizeof(uintptr_t)); - size_t bit = i - (bitidx * sizeof(uintptr_t)); - uintptr_t m = free_map[bitidx]; - if (bit == 0 && m == UINTPTR_MAX) { - i += (sizeof(uintptr_t) - 1); // skip a run of free blocks + uint8_t* block = pstart; + for (size_t i = 0; i < bmapsize; i++) { + if (free_map[i] == 0) { + // every block is in use + for (size_t j = 0; j < MI_INTPTR_BITS; j++) { + #if MI_DEBUG>1 + used_count++; + #endif + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } } - else if ((m & ((uintptr_t)1 << bit)) == 0) { - #if MI_DEBUG>1 - used_count++; - #endif - uint8_t* block = pstart + (i * bsize); - if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false; + else { + // visit the used blocks in the mask + uintptr_t m = ~free_map[i]; + while (m != 0) { + #if MI_DEBUG>1 + used_count++; + #endif + size_t bitidx = mi_ctz(m); + if (!visitor(heap, area, block + (bitidx * bsize), ubsize, arg)) return false; + m &= m - 1; // clear least significant bit + } + block += bsize * MI_INTPTR_BITS; } } mi_assert_internal(page->used == used_count); return true; } -typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); +// 
Separate struct to keep `mi_page_t` out of the public interface +typedef struct mi_heap_area_ex_s { + mi_heap_area_t area; + mi_page_t* page; +} mi_heap_area_ex_t; + +typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); + static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { MI_UNUSED(heap); MI_UNUSED(pq); mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; mi_heap_area_ex_t xarea; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); xarea.page = page; - xarea.area.reserved = page->reserved * bsize; - xarea.area.committed = page->capacity * bsize; - xarea.area.blocks = mi_page_start(page); - xarea.area.used = page->used; // number of blocks in use (#553) - xarea.area.block_size = ubsize; - xarea.area.full_block_size = bsize; + _mi_heap_area_init(&xarea.area, page); return fun(heap, &xarea, arg); } @@ -639,7 +715,7 @@ static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg; if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false; if (args->visit_blocks) { - return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg); + return _mi_heap_area_visit_blocks(&xarea->area, xarea->page, args->visitor, args->arg); } else { return true; diff --git a/third-party/mimalloc/src/init.c b/third-party/mimalloc/src/init.c index 6f51ca8923..a2d01db1dd 100644 --- a/third-party/mimalloc/src/init.c +++ b/third-party/mimalloc/src/init.c @@ -88,7 +88,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 } \ + { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() @@ -125,19 +125,22 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, // next false, // can reclaim 0, // 
tag + #if MI_GUARDED + 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) + #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY }; +static mi_decl_cache_align mi_subproc_t mi_subproc_default; + #define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats))) -#define tld_empty_os ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os))) mi_decl_cache_align static const mi_tld_t tld_empty = { 0, false, NULL, NULL, - { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments - { 0, tld_empty_stats }, // os + { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, tld_empty_stats }, // segments { MI_STATS_NULL } // stats }; @@ -150,15 +153,14 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; extern mi_heap_t _mi_heap_main; -static mi_tld_t tld_main = { +static mi_decl_cache_align mi_tld_t tld_main = { 0, false, &_mi_heap_main, & _mi_heap_main, - { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments - { 0, &tld_main.stats }, // os + { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, &tld_main.stats }, // segments { MI_STATS_NULL } // stats }; -mi_heap_t _mi_heap_main = { +mi_decl_cache_align mi_heap_t _mi_heap_main = { &tld_main, MI_ATOMIC_VAR_INIT(NULL), 0, // thread id @@ -171,6 +173,9 @@ mi_heap_t _mi_heap_main = { NULL, // next heap false, // can reclaim 0, // tag + #if MI_GUARDED + 0, 0, 0, 0, 0, + #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY }; @@ -179,6 +184,45 @@ bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. 
mi_stats_t _mi_stats_main = { MI_STATS_NULL }; +#if MI_GUARDED +mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { + heap->guarded_sample_seed = seed; + if (heap->guarded_sample_seed == 0) { + heap->guarded_sample_seed = _mi_heap_random_next(heap); + } + heap->guarded_sample_rate = sample_rate; + if (heap->guarded_sample_rate >= 1) { + heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate; + } + heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples +} + +mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { + heap->guarded_size_min = min; + heap->guarded_size_max = (min > max ? min : max); +} + +void _mi_heap_guarded_init(mi_heap_t* heap) { + mi_heap_guarded_set_sample_rate(heap, + (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX), + (size_t)mi_option_get(mi_option_guarded_sample_seed)); + mi_heap_guarded_set_size_bound(heap, + (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX), + (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); +} +#else +mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { + MI_UNUSED(heap); MI_UNUSED(sample_rate); MI_UNUSED(seed); +} + +mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { + MI_UNUSED(heap); MI_UNUSED(min); MI_UNUSED(max); +} +void _mi_heap_guarded_init(mi_heap_t* heap) { + MI_UNUSED(heap); +} +#endif + static void mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { @@ -192,6 +236,9 @@ static void mi_heap_main_init(void) { _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); + mi_lock_init(&mi_subproc_default.abandoned_os_lock); + mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock); + 
_mi_heap_guarded_init(&_mi_heap_main); } } @@ -201,6 +248,57 @@ mi_heap_t* _mi_heap_main_get(void) { } +/* ----------------------------------------------------------- + Sub process +----------------------------------------------------------- */ + +mi_subproc_id_t mi_subproc_main(void) { + return NULL; +} + +mi_subproc_id_t mi_subproc_new(void) { + mi_memid_t memid = _mi_memid_none(); + mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); + if (subproc == NULL) return NULL; + subproc->memid = memid; + subproc->abandoned_os_list = NULL; + mi_lock_init(&subproc->abandoned_os_lock); + mi_lock_init(&subproc->abandoned_os_visit_lock); + return subproc; +} + +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { + return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); +} + +void mi_subproc_delete(mi_subproc_id_t subproc_id) { + if (subproc_id == NULL) return; + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + // check if there are no abandoned segments still.. + bool safe_to_delete = false; + mi_lock(&subproc->abandoned_os_lock) { + if (subproc->abandoned_os_list == NULL) { + safe_to_delete = true; + } + } + if (!safe_to_delete) return; + // safe to release + // todo: should we refcount subprocesses? 
+ mi_lock_done(&subproc->abandoned_os_lock); + mi_lock_done(&subproc->abandoned_os_visit_lock); + _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); +} + +void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { + mi_heap_t* heap = mi_heap_get_default(); + if (heap == NULL) return; + mi_assert(heap->tld->segments.subproc == &mi_subproc_default); + if (heap->tld->segments.subproc != &mi_subproc_default) return; + heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id); +} + + + /* ----------------------------------------------------------- Initialization and freeing of the thread local heaps ----------------------------------------------------------- */ @@ -218,7 +316,7 @@ typedef struct mi_thread_data_s { // destroy many OS threads, this may causes too much overhead // per thread so we maintain a small cache of recently freed metadata. -#define TD_CACHE_SIZE (16) +#define TD_CACHE_SIZE (32) static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; static mi_thread_data_t* mi_thread_data_zalloc(void) { @@ -239,10 +337,10 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) { // if that fails, allocate as meta data if (td == NULL) { mi_memid_t memid; - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid); if (td == NULL) { // if this fails, try once more. 
(issue #257) - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); + td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid); if (td == NULL) { // really out of memory _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); @@ -272,7 +370,7 @@ static void mi_thread_data_free( mi_thread_data_t* tdfree ) { } } // if that fails, just free it directly - _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid, &_mi_stats_main); + _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid); } void _mi_thread_data_collect(void) { @@ -282,7 +380,7 @@ void _mi_thread_data_collect(void) { if (td != NULL) { td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); if (td != NULL) { - _mi_os_free(td, sizeof(mi_thread_data_t), td->memid, &_mi_stats_main); + _mi_os_free(td, sizeof(mi_thread_data_t), td->memid); } } } @@ -307,7 +405,7 @@ static bool _mi_thread_heap_init(void) { mi_heap_t* heap = &td->heap; _mi_tld_init(tld, heap); // must be before `_mi_heap_init` _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); - _mi_heap_set_default_direct(heap); + _mi_heap_set_default_direct(heap); } return false; } @@ -317,9 +415,8 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { _mi_memcpy_aligned(tld, &tld_empty, sizeof(mi_tld_t)); tld->heap_backing = bheap; tld->heaps = NULL; + tld->segments.subproc = &mi_subproc_default; tld->segments.stats = &tld->stats; - tld->segments.os = &tld->os; - tld->os.stats = &tld->stats; } // Free the thread local default heap (called from `mi_thread_done`) @@ -476,54 +573,15 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { // -------------------------------------------------------- // Run functions on process init/done, and thread init/done // -------------------------------------------------------- -static void mi_cdecl mi_process_done(void); - static bool os_preloading 
= true; // true until this module is initialized -static bool mi_redirected = false; // true if malloc redirects to mi_malloc // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. bool mi_decl_noinline _mi_preloading(void) { return os_preloading; } -mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept { - return mi_redirected; -} - -// Communicate with the redirection module on Windows -#if defined(_WIN32) && defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT) -#ifdef __cplusplus -extern "C" { -#endif -mi_decl_export void _mi_redirect_entry(DWORD reason) { - // called on redirection; careful as this may be called before DllMain - if (reason == DLL_PROCESS_ATTACH) { - mi_redirected = true; - } - else if (reason == DLL_PROCESS_DETACH) { - mi_redirected = false; - } - else if (reason == DLL_THREAD_DETACH) { - mi_thread_done(); - } -} -__declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message); -__declspec(dllimport) void mi_cdecl mi_allocator_done(void); -#ifdef __cplusplus -} -#endif -#else -static bool mi_allocator_init(const char** message) { - if (message != NULL) *message = NULL; - return true; -} -static void mi_allocator_done(void) { - // nothing to do -} -#endif - -// Called once by the process loader -static void mi_process_load(void) { +// Called once by the process loader from `src/prim/prim.c` +void _mi_process_load(void) { mi_heap_main_init(); #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; @@ -531,17 +589,14 @@ static void mi_process_load(void) { #endif os_preloading = false; mi_assert_internal(_mi_is_main_thread()); - #if !(defined(_WIN32) && defined(MI_SHARED_LIB)) // use Dll process detach (see below) instead of atexit (issue #521) - atexit(&mi_process_done); - #endif _mi_options_init(); mi_process_setup_auto_thread_done(); mi_process_init(); - if 
(mi_redirected) _mi_verbose_message("malloc is redirected.\n"); + if (_mi_is_redirected()) _mi_verbose_message("malloc is redirected.\n"); // show message from the redirector (if present) const char* msg = NULL; - mi_allocator_init(&msg); + _mi_allocator_init(&msg); if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) { _mi_fputs(NULL,NULL,NULL,msg); } @@ -553,12 +608,15 @@ static void mi_process_load(void) { #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include mi_decl_cache_align bool _mi_cpu_has_fsrm = false; +mi_decl_cache_align bool _mi_cpu_has_erms = false; static void mi_detect_cpu_features(void) { - // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) + // FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) + // EMRS for fast enhanced rep movsb/stosb support int32_t cpu_info[4]; __cpuid(cpu_info, 7); _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see + _mi_cpu_has_erms = ((cpu_info[2] & (1 << 9)) != 0); // bit 9 of ECX : see } #else static void mi_detect_cpu_features(void) { @@ -619,7 +677,7 @@ void mi_process_init(void) mi_attr_noexcept { } // Called when the process is done (through `at_exit`) -static void mi_cdecl mi_process_done(void) { +void mi_cdecl _mi_process_done(void) { // only shutdown if we were initialized if (!_mi_process_is_initialized) return; // ensure we are called once @@ -627,15 +685,20 @@ static void mi_cdecl mi_process_done(void) { if (process_done) return; process_done = true; + // get the default heap so we don't need to acces thread locals anymore + mi_heap_t* heap = mi_prim_get_default_heap(); // use prim to not initialize any heap + mi_assert_internal(heap != NULL); + // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread _mi_prim_thread_done_auto_done(); + #ifndef MI_SKIP_COLLECT_ON_EXIT #if (MI_DEBUG || 
!defined(MI_SHARED_LIB)) // free all memory if possible on process exit. This is not needed for a stand-alone process // but should be done if mimalloc is statically linked into another shared library which // is repeatedly loaded/unloaded, see issue #281. - mi_collect(true /* force */ ); + mi_heap_collect(heap, true /* force */ ); #endif #endif @@ -643,72 +706,17 @@ static void mi_cdecl mi_process_done(void) { // since after process_done there might still be other code running that calls `free` (like at_exit routines, // or C-runtime termination code. if (mi_option_is_enabled(mi_option_destroy_on_exit)) { - mi_collect(true /* force */); - _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arena_unsafe_destroy_all(& _mi_heap_main_get()->tld->stats); + mi_heap_collect(heap, true /* force */); + _mi_heap_unsafe_destroy_all(heap); // forcefully release all memory held by all heaps (of this thread only!) + _mi_arena_unsafe_destroy_all(); + _mi_segment_map_unsafe_destroy(); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { mi_stats_print(NULL); } - mi_allocator_done(); + _mi_allocator_done(); _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); os_preloading = true; // don't call the C runtime anymore } - - -#if defined(_WIN32) && defined(MI_SHARED_LIB) - // Windows DLL: easy to hook into process_init and thread_done - __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { - MI_UNUSED(reserved); - MI_UNUSED(inst); - if (reason==DLL_PROCESS_ATTACH) { - mi_process_load(); - } - else if (reason==DLL_PROCESS_DETACH) { - mi_process_done(); - } - else if (reason==DLL_THREAD_DETACH) { - if (!mi_is_redirected()) { - mi_thread_done(); - } - } - return TRUE; - } - -#elif defined(_MSC_VER) - // MSVC: use data section magic for static libraries - // See - static int _mi_process_init(void) { - mi_process_load(); - return 0; - 
} - typedef int(*_mi_crt_callback_t)(void); - #if defined(_M_X64) || defined(_M_ARM64) - __pragma(comment(linker, "/include:" "_mi_msvc_initu")) - #pragma section(".CRT$XIU", long, read) - #else - __pragma(comment(linker, "/include:" "__mi_msvc_initu")) - #endif - #pragma data_seg(".CRT$XIU") - mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init }; - #pragma data_seg() - -#elif defined(__cplusplus) - // C++: use static initialization to detect process start - static bool _mi_process_init(void) { - mi_process_load(); - return (_mi_heap_main.thread_id != 0); - } - static bool mi_initialized = _mi_process_init(); - -#elif defined(__GNUC__) || defined(__clang__) - // GCC,Clang: use the constructor attribute - static void __attribute__((constructor)) _mi_process_init(void) { - mi_process_load(); - } - -#else -#pragma message("define a way to call mi_process_load on your platform") -#endif diff --git a/third-party/mimalloc/src/libc.c b/third-party/mimalloc/src/libc.c index dd6b400737..ce541f1b52 100644 --- a/third-party/mimalloc/src/libc.c +++ b/third-party/mimalloc/src/libc.c @@ -130,7 +130,7 @@ static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, } -static void mi_out_num(uintptr_t x, size_t base, char prefix, char** out, char* end) +static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) { if (x == 0 || base == 0 || base > 16) { if (prefix != 0) { mi_outc(prefix, out, end); } @@ -206,12 +206,13 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { } else if (c == 'p' || c == 'x' || c == 'u') { // unsigned - uintptr_t x = 0; + uintmax_t x = 0; if (c == 'x' || c == 'u') { if (numtype == 'z') x = va_arg(args, size_t); else if (numtype == 't') x = va_arg(args, uintptr_t); // unsigned ptrdiff_t - else if (numtype == 'L') x = (uintptr_t)va_arg(args, unsigned long long); - else x = va_arg(args, unsigned long); + else if (numtype == 'L') x = va_arg(args, unsigned long 
long); + else if (numtype == 'l') x = va_arg(args, unsigned long); + else x = va_arg(args, unsigned int); } else if (c == 'p') { x = va_arg(args, uintptr_t); @@ -228,20 +229,21 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { } else if (c == 'i' || c == 'd') { // signed - intptr_t x = 0; + intmax_t x = 0; if (numtype == 'z') x = va_arg(args, intptr_t ); else if (numtype == 't') x = va_arg(args, ptrdiff_t); - else if (numtype == 'L') x = (intptr_t)va_arg(args, long long); - else x = va_arg(args, long); + else if (numtype == 'L') x = va_arg(args, long long); + else if (numtype == 'l') x = va_arg(args, long); + else x = va_arg(args, int); char pre = 0; if (x < 0) { pre = '-'; - if (x > INTPTR_MIN) { x = -x; } + if (x > INTMAX_MIN) { x = -x; } } else if (numplus != 0) { pre = numplus; } - mi_out_num((uintptr_t)x, 10, pre, &out, end); + mi_out_num((uintmax_t)x, 10, pre, &out, end); } else if (c >= ' ' && c <= '~') { // unknown format diff --git a/third-party/mimalloc/src/options.c b/third-party/mimalloc/src/options.c index a62727dd69..d7fa666517 100644 --- a/third-party/mimalloc/src/options.c +++ b/third-party/mimalloc/src/options.c @@ -47,6 +47,62 @@ typedef struct mi_option_desc_s { #define MI_OPTION(opt) mi_option_##opt, #opt, NULL #define MI_OPTION_LEGACY(opt,legacy) mi_option_##opt, #opt, #legacy +// Some options can be set at build time for statically linked libraries +// (use `-DMI_EXTRA_CPPDEFS="opt1=val1;opt2=val2"`) +// +// This is useful if we cannot pass them as environment variables +// (and setting them programmatically would be too late) + +#ifndef MI_DEFAULT_VERBOSE +#define MI_DEFAULT_VERBOSE 0 +#endif + +#ifndef MI_DEFAULT_EAGER_COMMIT +#define MI_DEFAULT_EAGER_COMMIT 1 +#endif + +#ifndef MI_DEFAULT_ARENA_EAGER_COMMIT +#define MI_DEFAULT_ARENA_EAGER_COMMIT 2 +#endif + +// in KiB +#ifndef MI_DEFAULT_ARENA_RESERVE + #if (MI_INTPTR_SIZE>4) + #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L + #else + #define 
MI_DEFAULT_ARENA_RESERVE 128L*1024L + #endif +#endif + +#ifndef MI_DEFAULT_DISALLOW_ARENA_ALLOC +#define MI_DEFAULT_DISALLOW_ARENA_ALLOC 0 +#endif + +#ifndef MI_DEFAULT_ALLOW_LARGE_OS_PAGES +#if defined(__linux__) && !defined(__ANDROID__) +#define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 2 // enabled, but only use transparent huge pages through madvise +#else +#define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 0 +#endif +#endif + +#ifndef MI_DEFAULT_RESERVE_HUGE_OS_PAGES +#define MI_DEFAULT_RESERVE_HUGE_OS_PAGES 0 +#endif + +#ifndef MI_DEFAULT_RESERVE_OS_MEMORY +#define MI_DEFAULT_RESERVE_OS_MEMORY 0 +#endif + +#ifndef MI_DEFAULT_GUARDED_SAMPLE_RATE +#if MI_GUARDED +#define MI_DEFAULT_GUARDED_SAMPLE_RATE 4000 +#else +#define MI_DEFAULT_GUARDED_SAMPLE_RATE 0 +#endif +#endif + + static mi_option_desc_t options[_mi_option_last] = { // stable options @@ -56,16 +112,21 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(show_errors) }, #endif { 0, UNINIT, MI_OPTION(show_stats) }, - { 0, UNINIT, MI_OPTION(verbose) }, + { MI_DEFAULT_VERBOSE, UNINIT, MI_OPTION(verbose) }, - // the following options are experimental and not all combinations make sense. - { 1, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`) - { 2, UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. linux) + // some of the following options are experimental and not all combinations are allowed. + { MI_DEFAULT_EAGER_COMMIT, + UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`) + { MI_DEFAULT_ARENA_EAGER_COMMIT, + UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. 
linux) { 1, UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) }, // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit) - { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's - { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages + { MI_DEFAULT_ALLOW_LARGE_OS_PAGES, + UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's + { MI_DEFAULT_RESERVE_HUGE_OS_PAGES, + UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N - { 0, UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) + { MI_DEFAULT_RESERVE_OS_MEMORY, + UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread { 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) }, // reset free page memory when a thread terminates @@ -83,16 +144,24 @@ static mi_option_desc_t options[_mi_option_last] = { 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! 
- #if (MI_INTPTR_SIZE>4) - { 1024L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) - #else - { 128L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // =128MiB on 32-bit - #endif - { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's + { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) + { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 0, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) + { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. +#if defined(MI_VISIT_ABANDONED) + { 1, INITIALIZED, MI_OPTION(visit_abandoned) }, // allow visiting heap blocks in abandoned segments; requires taking locks during reclaim. 
+#else + { 0, UNINIT, MI_OPTION(visit_abandoned) }, +#endif + { 0, UNINIT, MI_OPTION(guarded_min) }, // only used when building with MI_GUARDED: minimal rounded object size for guarded objects + { MI_GiB, UNINIT, MI_OPTION(guarded_max) }, // only used when building with MI_GUARDED: maximal rounded object size for guarded objects + { 0, UNINIT, MI_OPTION(guarded_precise) }, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) + { MI_DEFAULT_GUARDED_SAMPLE_RATE, + UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) + { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, + { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. }; static void mi_option_init(mi_option_desc_t* desc); @@ -102,8 +171,7 @@ static bool mi_option_has_size_in_kib(mi_option_t option) { } void _mi_options_init(void) { - // called on process load; should not be called before the CRT is initialized! - // (e.g. do not call this from process_init as that may run before CRT initialization) + // called on process load mi_add_stderr_output(); // now it safe to use stderr for output for(int i = 0; i < _mi_option_last; i++ ) { mi_option_t option = (mi_option_t)i; @@ -116,8 +184,26 @@ void _mi_options_init(void) { } mi_max_error_count = mi_option_get(mi_option_max_errors); mi_max_warning_count = mi_option_get(mi_option_max_warnings); + #if MI_GUARDED + if (mi_option_get(mi_option_guarded_sample_rate) > 0) { + if (mi_option_is_enabled(mi_option_allow_large_os_pages)) { + mi_option_disable(mi_option_allow_large_os_pages); + _mi_warning_message("option 'allow_large_os_pages' is disabled to allow for guarded objects\n"); + } + } + _mi_verbose_message("guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? 
"enabled" : "disabled"); + #endif } +long _mi_option_get_fast(mi_option_t option) { + mi_assert(option >= 0 && option < _mi_option_last); + mi_option_desc_t* desc = &options[option]; + mi_assert(desc->option == option); // index should match the option + //mi_assert(desc->init != UNINIT); + return desc->value; +} + + mi_decl_nodiscard long mi_option_get(mi_option_t option) { mi_assert(option >= 0 && option < _mi_option_last); if (option < 0 || option >= _mi_option_last) return 0; @@ -135,7 +221,6 @@ mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long ma } mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) { - mi_assert_internal(mi_option_has_size_in_kib(option)); const long x = mi_option_get(option); size_t size = (x < 0 ? 0 : (size_t)x); if (mi_option_has_size_in_kib(option)) { @@ -151,6 +236,13 @@ void mi_option_set(mi_option_t option, long value) { mi_assert(desc->option == option); // index should match the option desc->value = value; desc->init = INITIALIZED; + // ensure min/max range; be careful to not recurse. + if (desc->option == mi_option_guarded_min && _mi_option_get_fast(mi_option_guarded_max) < value) { + mi_option_set(mi_option_guarded_max, value); + } + else if (desc->option == mi_option_guarded_max && _mi_option_get_fast(mi_option_guarded_min) > value) { + mi_option_set(mi_option_guarded_min, value); + } } void mi_option_set_default(mi_option_t option, long value) { @@ -194,7 +286,7 @@ static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { // an output function is registered it is called immediately with // the output up to that point. 
#ifndef MI_MAX_DELAY_OUTPUT -#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024)) +#define MI_MAX_DELAY_OUTPUT ((size_t)(16*1024)) #endif static char out_buf[MI_MAX_DELAY_OUTPUT+1]; static _Atomic(size_t) out_len; @@ -280,7 +372,7 @@ static _Atomic(size_t) warning_count; // = 0; // when >= max_warning_count stop // (recursively) invoke malloc again to allocate space for the thread local // variables on demand. This is why we use a _mi_preloading test on such // platforms. However, C code generator may move the initial thread local address -// load before the `if` and we therefore split it out in a separate funcion. +// load before the `if` and we therefore split it out in a separate function. static mi_decl_thread bool recurse = false; static mi_decl_noinline bool mi_recurse_enter_prim(void) { @@ -485,7 +577,7 @@ static void mi_option_init(mi_option_desc_t* desc) { char* end = buf; long value = strtol(buf, &end, 10); if (mi_option_has_size_in_kib(desc->option)) { - // this option is interpreted in KiB to prevent overflow of `long` for large allocations + // this option is interpreted in KiB to prevent overflow of `long` for large allocations // (long is 32-bit on 64-bit windows, which allows for 4TiB max.) size_t size = (value < 0 ? 0 : (size_t)value); bool overflow = false; @@ -500,8 +592,7 @@ static void mi_option_init(mi_option_desc_t* desc) { value = (size > LONG_MAX ? LONG_MAX : (long)size); } if (*end == 0) { - desc->value = value; - desc->init = INITIALIZED; + mi_option_set(desc->option, value); } else { // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose. diff --git a/third-party/mimalloc/src/os.c b/third-party/mimalloc/src/os.c index ce104273bf..61c9eebfe9 100644 --- a/third-party/mimalloc/src/os.c +++ b/third-party/mimalloc/src/os.c @@ -9,18 +9,38 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" +#define mi_os_stat_increase(stat,amount) _mi_stat_increase(&_mi_stats_main.stat, amount) +#define mi_os_stat_decrease(stat,amount) _mi_stat_decrease(&_mi_stats_main.stat, amount) +#define mi_os_stat_counter_increase(stat,inc) _mi_stat_counter_increase(&_mi_stats_main.stat, inc) /* ----------------------------------------------------------- - Initialization. + Initialization. ----------------------------------------------------------- */ +#ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS +#if MI_INTPTR_SIZE < 8 +#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 32 +#else +#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 48 +#endif +#endif + +#ifndef MI_DEFAULT_PHYSICAL_MEMORY +#if MI_INTPTR_SIZE < 8 +#define MI_DEFAULT_PHYSICAL_MEMORY 4*MI_GiB +#else +#define MI_DEFAULT_PHYSICAL_MEMORY 32*MI_GiB +#endif +#endif static mi_os_mem_config_t mi_os_mem_config = { - 4096, // page size - 0, // large page size (usually 2MiB) - 4096, // allocation granularity - true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) - false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) - true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory) + 4096, // page size + 0, // large page size (usually 2MiB) + 4096, // allocation granularity + MI_DEFAULT_PHYSICAL_MEMORY, + MI_DEFAULT_VIRTUAL_ADDRESS_BITS, + true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) + false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) + true // has virtual reserve? 
(if true we can reserve virtual address space without using commit or physical memory) }; bool _mi_os_has_overcommit(void) { @@ -68,17 +88,18 @@ void _mi_os_init(void) { /* ----------------------------------------------------------- Util -------------------------------------------------------------- */ -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_commit(void* addr, size_t size, bool* is_zero); /* ----------------------------------------------------------- aligned hinting -------------------------------------------------------------- */ -// On 64-bit systems, we can do efficient aligned allocation by using -// the 2TiB to 30TiB area to allocate those. -#if (MI_INTPTR_SIZE >= 8) +// On systems with enough virtual address bits, we can do efficient aligned allocation by using +// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address +// space (64TiB) we use this technique. (but see issue #939) +#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT) static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; // Return a MI_SEGMENT_SIZE aligned address that is probably available. @@ -95,6 +116,7 @@ static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; + if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space size = _mi_align_up(size, MI_SEGMENT_SIZE); if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096. 
#if (MI_SECURE>0) @@ -122,44 +144,50 @@ void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { } #endif - /* ----------------------------------------------------------- Free memory -------------------------------------------------------------- */ -static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); +static void mi_os_free_huge_os_pages(void* p, size_t size); -static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) { mi_assert_internal((size % _mi_os_page_size()) == 0); if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) int err = _mi_prim_free(addr, size); if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } - if (still_committed) { _mi_stat_decrease(&stats->committed, size); } - _mi_stat_decrease(&stats->reserved, size); + if (commit_size > 0) { + mi_os_stat_decrease(committed, commit_size); + } + mi_os_stat_decrease(reserved, size); } -void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* tld_stats) { +void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { if (mi_memkind_is_os(memid.memkind)) { - size_t csize = _mi_os_good_alloc_size(size); + size_t csize = memid.mem.os.size; + if (csize==0) { _mi_os_good_alloc_size(size); } + size_t commit_size = (still_committed ? csize : 0); void* base = addr; // different base? 
(due to alignment) - if (memid.mem.os.base != NULL) { - mi_assert(memid.mem.os.base <= addr); - mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); + if (memid.mem.os.base != base) { + mi_assert(memid.mem.os.base <= addr); base = memid.mem.os.base; - csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); + const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base; + if (memid.mem.os.size==0) { + csize += diff; + } + if (still_committed) { + commit_size -= diff; // the (addr-base) part was already un-committed + } } // free it if (memid.memkind == MI_MEM_OS_HUGE) { mi_assert(memid.is_pinned); - mi_os_free_huge_os_pages(base, csize, tld_stats); + mi_os_free_huge_os_pages(base, csize); } else { - mi_os_prim_free(base, csize, still_committed, tld_stats); + mi_os_prim_free(base, csize, (still_committed ? commit_size : 0)); } } else { @@ -168,8 +196,8 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) { - _mi_os_free_ex(p, size, true, memid, tld_stats); +void _mi_os_free(void* p, size_t size, mi_memid_t memid) { + _mi_os_free_ex(p, size, true, memid); } @@ -178,7 +206,8 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) -------------------------------------------------------------- */ // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { +// Also `hint_addr` is a hint and may be ignored. 
+static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_zero != NULL); mi_assert_internal(is_large != NULL); @@ -187,18 +216,18 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning *is_zero = false; void* p = NULL; - int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); + int err = _mi_prim_alloc(hint_addr, size, try_alignment, commit, allow_large, is_large, is_zero, &p); if (err != 0) { - _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); + _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); } - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - mi_stat_counter_increase(stats->mmap_calls, 1); + + + mi_os_stat_counter_increase(mmap_calls, 1); if (p != NULL) { - _mi_stat_increase(&stats->reserved, size); + mi_os_stat_increase(reserved, size); if (commit) { - _mi_stat_increase(&stats->committed, size); + mi_os_stat_increase(committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } @@ -209,10 +238,14 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo return p; } +static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { + return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero); +} + // 
Primitive aligned allocation from the OS. // This function guarantees the allocated memory is aligned. -static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base, mi_stats_t* stats) { +static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base) { mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_large != NULL); @@ -222,8 +255,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - // try first with a hint (this will be aligned directly on Win 10+ or BSD) - void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); + // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) + void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); if (p == NULL) return NULL; // aligned already? @@ -232,14 +265,16 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit } else { // if not aligned, free it, overallocate, and unmap around it + #if !MI_TRACK_ASAN _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); - mi_os_prim_free(p, size, commit, stats); + #endif + if (p != NULL) { mi_os_prim_free(p, size, (commit ? 
size : 0)); } if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block // over-allocate uncommitted (virtual) memory - p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero); if (p == NULL) return NULL; // set p to the aligned part in the full region @@ -250,22 +285,22 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // explicitly commit only the aligned part if (commit) { - _mi_os_commit(p, size, NULL, stats); + _mi_os_commit(p, size, NULL); } } else { // mmap can free inside an allocation // overallocate... - p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero); if (p == NULL) return NULL; - // and selectively unmap parts around the over-allocated area. + // and selectively unmap parts around the over-allocated area. void* aligned_p = mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t post_size = over_size - pre_size - mid_size; mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); - if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit, stats); } - if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } + if (pre_size > 0) { mi_os_prim_free(p, pre_size, (commit ? pre_size : 0)); } + if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, (commit ? post_size : 0)); } // we can return the aligned pointer on `mmap` systems p = aligned_p; *base = aligned_p; // since we freed the pre part, `*base == p`. 
@@ -281,20 +316,20 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit OS API: alloc and alloc_aligned ----------------------------------------------------------- */ -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_os_alloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); bool os_is_large = false; bool os_is_zero = false; - void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); + void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero); if (p != NULL) { *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); } return p; } -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid) { MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings *memid = _mi_memid_none(); @@ -305,11 +340,12 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; - void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base ); if (p != NULL) { *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; - memid->mem.os.alignment = alignment; + // memid->mem.os.alignment = alignment; + memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned } return p; } @@ -322,7 +358,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo to use the actual start of the memory region. 
----------------------------------------------------------- */ -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) { mi_assert(offset <= MI_SEGMENT_SIZE); mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); @@ -330,20 +366,20 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } else { // overallocate to align at an offset const size_t extra = _mi_align_up(offset, alignment) - offset; const size_t oversize = size + extra; - void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, stats); + void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid); if (start == NULL) return NULL; void* const p = (uint8_t*)start + extra; mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); // decommit the overallocation at the start if (commit && extra > _mi_os_page_size()) { - _mi_os_decommit(start, extra, stats); + _mi_os_decommit(start, extra); } return p; } @@ -377,12 +413,10 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size) { if (is_zero != NULL) { *is_zero = false; } - _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. 
decommit - _mi_stat_counter_increase(&stats->commit_calls, 1); + mi_os_stat_increase(committed, stat_size); // use size for precise commit vs. decommit + mi_os_stat_counter_increase(commit_calls, 1); // page align range size_t csize; @@ -408,11 +442,13 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats return true; } -static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { + return _mi_os_commit_ex(addr, size, is_zero, size); +} + +static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, size_t stat_size) { mi_assert_internal(needs_recommit!=NULL); - _mi_stat_decrease(&stats->committed, size); + mi_os_stat_decrease(committed, stat_size); // page align size_t csize; @@ -429,9 +465,9 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_ return (err == 0); } -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { +bool _mi_os_decommit(void* addr, size_t size) { bool needs_recommit; - return mi_os_decommit_ex(addr, size, &needs_recommit, tld_stats); + return mi_os_decommit_ex(addr, size, &needs_recommit, size); } @@ -439,13 +475,13 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. 
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - _mi_stat_increase(&stats->reset, csize); - _mi_stat_counter_increase(&stats->reset_calls, 1); + mi_os_stat_increase(reset, csize); + mi_os_stat_counter_increase(reset_calls, 1); #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN memset(start, 0, csize); // pretend it is eagerly reset @@ -461,22 +497,22 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? - _mi_stat_counter_increase(&stats->purge_calls, 1); - _mi_stat_increase(&stats->purged, size); + mi_os_stat_counter_increase(purge_calls, 1); + mi_os_stat_increase(purged, size); if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? !_mi_preloading()) // don't decommit during preloading (unsafe) { bool needs_recommit = true; - mi_os_decommit_ex(p, size, &needs_recommit, stats); + mi_os_decommit_ex(p, size, &needs_recommit, stat_size); return needs_recommit; } else { if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed - _mi_os_reset(p, size, stats); + _mi_os_reset(p, size); } return false; // needs no recommit } @@ -484,8 +520,8 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. 
-bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { - return _mi_os_purge_ex(p, size, true, stats); +bool _mi_os_purge(void* p, size_t size) { + return _mi_os_purge_ex(p, size, true, size); } // Protect a region in memory to be not accessible. @@ -592,15 +628,15 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // no success, issue a warning and break if (p != NULL) { _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr); - mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true, &_mi_stats_main); + mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE); } break; } // success, record it page++; // increase before timeout check (see issue #711) - _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { @@ -634,11 +670,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. 
-static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats) { +static void mi_os_free_huge_os_pages(void* p, size_t size) { if (p==NULL || size==0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { - mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true, stats); + mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE); size -= MI_HUGE_OS_PAGE_SIZE; base += MI_HUGE_OS_PAGE_SIZE; } @@ -667,8 +703,7 @@ size_t _mi_os_numa_node_count_get(void) { return count; } -int _mi_os_numa_node_get(mi_os_tld_t* tld) { - MI_UNUSED(tld); +int _mi_os_numa_node_get(void) { size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 diff --git a/third-party/mimalloc/src/page-queue.c b/third-party/mimalloc/src/page-queue.c index ceea91ee4d..83b60e931b 100644 --- a/third-party/mimalloc/src/page-queue.c +++ b/third-party/mimalloc/src/page-queue.c @@ -264,8 +264,16 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ heap->page_count++; } +static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(mi_page_heap(page) == heap); + mi_assert_internal(mi_page_queue_contains(queue, page)); + if (queue->first == page) return; + mi_page_queue_remove(queue, page); + mi_page_queue_push(heap, queue, page); + mi_assert_internal(queue->first == page); +} -static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { +static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* from, bool enqueue_at_end, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); @@ -278,6 +286,8 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro 
(mi_page_is_large_or_huge(page) && mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); + + // delete from `from` if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == from->last) from->last = page->prev; @@ -288,22 +298,59 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_heap_queue_first_update(heap, from); } - page->prev = to->last; - page->next = NULL; - if (to->last != NULL) { - mi_assert_internal(heap == mi_page_heap(to->last)); - to->last->next = page; - to->last = page; + // insert into `to` + if (enqueue_at_end) { + // enqueue at the end + page->prev = to->last; + page->next = NULL; + if (to->last != NULL) { + mi_assert_internal(heap == mi_page_heap(to->last)); + to->last->next = page; + to->last = page; + } + else { + to->first = page; + to->last = page; + mi_heap_queue_first_update(heap, to); + } } else { - to->first = page; - to->last = page; - mi_heap_queue_first_update(heap, to); + if (to->first != NULL) { + // enqueue at 2nd place + mi_assert_internal(heap == mi_page_heap(to->first)); + mi_page_t* next = to->first->next; + page->prev = to->first; + page->next = next; + to->first->next = page; + if (next != NULL) { + next->prev = page; + } + else { + to->last = page; + } + } + else { + // enqueue at the head (singleton list) + page->prev = NULL; + page->next = NULL; + to->first = page; + to->last = page; + mi_heap_queue_first_update(heap, to); + } } mi_page_set_in_full(page, mi_page_queue_is_full(to)); } +static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { + mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end */, page); +} + +static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { + // note: we could insert at the front to increase reuse, but it slows down certain benchmarks (like `alloc-test`) + 
mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end of the `to` queue? */, page); +} + // Only called from `mi_heap_absorb`. size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) { mi_assert_internal(mi_heap_contains_queue(heap,pq)); diff --git a/third-party/mimalloc/src/page.c b/third-party/mimalloc/src/page.c index 871ed21514..6a559af038 100644 --- a/third-party/mimalloc/src/page.c +++ b/third-party/mimalloc/src/page.c @@ -276,7 +276,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == pq->block_size); #endif - mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os); + mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; @@ -358,7 +358,7 @@ void _mi_page_unfull(mi_page_t* page) { mi_page_set_in_full(page, false); // to get the right queue mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); mi_page_set_in_full(page, true); - mi_page_queue_enqueue_from(pq, pqfull, page); + mi_page_queue_enqueue_from_full(pq, pqfull, page); } static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { @@ -404,6 +404,28 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { _mi_segment_page_abandon(page,segments_tld); } +// force abandon a page +void _mi_page_force_abandon(mi_page_t* page) { + mi_heap_t* heap = mi_page_heap(page); + // mark page as not using delayed free + _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); + + // ensure this page is no longer in the heap delayed free list + _mi_heap_delayed_free_all(heap); + // We can still access the page meta-info even if it is freed as we ensure + // 
in `mi_segment_force_abandon` that the segment is not freed (yet) + if (page->capacity == 0) return; // it may have been freed now + + // and now unlink it from the page queue and abandon (or free) + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + if (mi_page_all_free(page)) { + _mi_page_free(page, pq, false); + } + else { + _mi_page_abandon(page, pq); + } +} + // Free a page with no more free blocks void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { @@ -451,6 +473,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); + #if MI_RETIRE_CYCLES > 0 const size_t bsize = mi_page_block_size(page); if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? @@ -466,6 +489,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { return; // don't free after all } } + #endif _mi_page_free(page, pq, false); } @@ -712,6 +736,17 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi Find pages with free blocks -------------------------------------------------------------*/ +// search for a best next page to use for at most N pages (often cut short if immediate blocks are available) +#define MI_MAX_CANDIDATE_SEARCH (4) + +// is the page not yet used up to its reserved space? +static bool mi_page_is_expandable(const mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(page->capacity <= page->reserved); + return (page->capacity < page->reserved); +} + + // Find a page with free blocks of `page->block_size`. 
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) { @@ -719,39 +754,77 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p #if MI_STAT size_t count = 0; #endif + size_t candidate_count = 0; // we reset this on the first candidate to limit the search + mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; + while (page != NULL) { mi_page_t* next = page->next; // remember next #if MI_STAT count++; #endif + candidate_count++; - // 0. collect freed blocks by us and other threads + // collect freed blocks by us and other threads _mi_page_free_collect(page, false); - // 1. if the page contains free blocks, we are done - if (mi_page_immediate_available(page)) { - break; // pick this one - } + #if MI_MAX_CANDIDATE_SEARCH > 1 + // search up to N pages for a best candidate - // 2. Try to extend - if (page->capacity < page->reserved) { - mi_page_extend_free(heap, page, heap->tld); - mi_assert_internal(mi_page_immediate_available(page)); - break; + // is the local free list non-empty? + const bool immediate_available = mi_page_immediate_available(page); + + // if the page is completely full, move it to the `mi_pages_full` + // queue so we don't visit long-lived pages too often. 
+ if (!immediate_available && !mi_page_is_expandable(page)) { + mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); + mi_page_to_full(page, pq); + } + else { + // the page has free space, make it a candidate + // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages) + if (page_candidate == NULL) { + page_candidate = page; + candidate_count = 0; + } + // prefer to reuse fuller pages (in the hope the less used page gets freed) + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) { + page_candidate = page; + } + // if we find a non-expandable candidate, or searched for N pages, return with the best candidate + if (immediate_available || candidate_count > MI_MAX_CANDIDATE_SEARCH) { + mi_assert_internal(page_candidate!=NULL); + break; + } + } + #else + // first-fit algorithm + // If the page contains free blocks, we are done + if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) { + break; // pick this one } - // 3. If the page is completely full, move it to the `mi_pages_full` + // If the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); mi_page_to_full(page, pq); + #endif page = next; } // for each page mi_heap_stat_counter_increase(heap, searches, count); + // set the page to the best candidate + if (page_candidate != NULL) { + page = page_candidate; + } + if (page != NULL && !mi_page_immediate_available(page)) { + mi_assert_internal(mi_page_is_expandable(page)); + mi_page_extend_free(heap, page, heap->tld); + } + if (page == NULL) { _mi_heap_collect_retired(heap, false); // perhaps make a page available? 
page = mi_page_fresh(heap, pq); @@ -761,10 +834,14 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p } } else { - mi_assert(pq->first == page); + // move the page to the front of the queue + mi_page_queue_move_to_front(heap, pq, page); page->retire_expire = 0; + // _mi_heap_collect_retired(heap, false); // update retire counts; note: increases rss on MemoryLoad bench so don't do this } mi_assert_internal(page == NULL || mi_page_immediate_available(page)); + + return page; } @@ -772,7 +849,9 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p // Find a page with free blocks of `size`. static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { - mi_page_queue_t* pq = mi_page_queue(heap,size); + mi_page_queue_t* pq = mi_page_queue(heap, size); + + // check the first page: we even do this with candidate search or otherwise we re-search every time mi_page_t* page = pq->first; if (page != NULL) { #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness @@ -791,6 +870,7 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { return page; // fast path } } + return mi_page_queue_find_free_ex(heap, pq, true); } @@ -930,7 +1010,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_block_size(page) >= size); // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) - if mi_unlikely(zero && page->block_size == 0) { + if mi_unlikely(zero && mi_page_is_huge(page)) { // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. 
void* p = _mi_page_malloc(heap, page, size); mi_assert_internal(p != NULL); diff --git a/third-party/mimalloc/src/prim/emscripten/prim.c b/third-party/mimalloc/src/prim/emscripten/prim.c index f3797c9e66..82147de799 100644 --- a/third-party/mimalloc/src/prim/emscripten/prim.c +++ b/third-party/mimalloc/src/prim/emscripten/prim.c @@ -71,8 +71,8 @@ int _mi_prim_free(void* addr, size_t size) { extern void* emmalloc_memalign(size_t alignment, size_t size); // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { - MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit); +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { + MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr); *is_large = false; // TODO: Track the highest address ever seen; first uses of it are zeroes. 
// That assumes no one else uses sbrk but us (they could go up, @@ -200,7 +200,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Thread init/done //---------------------------------------------------------------- -#ifdef __EMSCRIPTEN_SHARED_MEMORY__ +#if defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending // (and used with MI_TLS_PTHREADS for the default heap) diff --git a/third-party/mimalloc/src/prim/osx/alloc-override-zone.c b/third-party/mimalloc/src/prim/osx/alloc-override-zone.c index 1515b886b2..d3af170dec 100644 --- a/third-party/mimalloc/src/prim/osx/alloc-override-zone.c +++ b/third-party/mimalloc/src/prim/osx/alloc-override-zone.c @@ -418,9 +418,9 @@ static inline malloc_zone_t* mi_get_default_zone(void) } #if defined(__clang__) -__attribute__((constructor(0))) +__attribute__((constructor(101))) // highest priority #else -__attribute__((constructor)) // seems not supported by g++-11 on the M1 +__attribute__((constructor)) // priority level is not supported by gcc #endif __attribute__((used)) static void _mi_macos_override_malloc(void) { diff --git a/third-party/mimalloc/src/prim/prim.c b/third-party/mimalloc/src/prim/prim.c index 3b7d373642..2002853f28 100644 --- a/third-party/mimalloc/src/prim/prim.c +++ b/third-party/mimalloc/src/prim/prim.c @@ -25,3 +25,52 @@ terms of the MIT license. A copy of the license can be found in the file #include "unix/prim.c" // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.) 
#endif + +// Generic process initialization +#ifndef MI_PRIM_HAS_PROCESS_ATTACH +#if defined(__GNUC__) || defined(__clang__) + // gcc,clang: use the constructor/destructor attribute + // which for both seem to run before regular constructors/destructors + #if defined(__clang__) + #define mi_attr_constructor __attribute__((constructor(101))) + #define mi_attr_destructor __attribute__((destructor(101))) + #else + #define mi_attr_constructor __attribute__((constructor)) + #define mi_attr_destructor __attribute__((destructor)) + #endif + static void mi_attr_constructor mi_process_attach(void) { + _mi_process_load(); + } + static void mi_attr_destructor mi_process_detach(void) { + _mi_process_done(); + } +#elif defined(__cplusplus) + // C++: use static initialization to detect process start/end + // This is not guaranteed to be first/last but the best we can generally do? + struct mi_init_done_t { + mi_init_done_t() { + _mi_process_load(); + } + ~mi_init_done_t() { + _mi_process_done(); + } + }; + static mi_init_done_t mi_init_done; + #else + #pragma message("define a way to call _mi_process_load/done on your platform") +#endif +#endif + +// Generic allocator init/done callback +#ifndef MI_PRIM_HAS_ALLOCATOR_INIT +bool _mi_is_redirected(void) { + return false; +} +bool _mi_allocator_init(const char** message) { + if (message != NULL) { *message = NULL; } + return true; +} +void _mi_allocator_done(void) { + // nothing to do +} +#endif diff --git a/third-party/mimalloc/src/prim/unix/prim.c b/third-party/mimalloc/src/prim/unix/prim.c index 90a4aac2a1..46869c861e 100644 --- a/third-party/mimalloc/src/prim/unix/prim.c +++ b/third-party/mimalloc/src/prim/unix/prim.c @@ -22,19 +22,18 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include <sys/mman.h> // mmap #include <unistd.h> // sysconf #include <fcntl.h> // open, close, read, access -#include <stdlib.h> +#include <stdlib.h> // getenv, arc4random_buf #if defined(__linux__) #include <features.h> - #if defined(MI_NO_THP) - #include <sys/prctl.h> - #endif + //#if defined(MI_NO_THP) + #include <sys/prctl.h> // THP disable + //#endif #if defined(__GLIBC__) #include <linux/mman.h> // linux mmap flags #else @@ -141,6 +140,12 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) if (psize > 0) { config->page_size = (size_t)psize; config->alloc_granularity = (size_t)psize; + #if defined(_SC_PHYS_PAGES) + long pphys = sysconf(_SC_PHYS_PAGES); + if (pphys > 0 && (size_t)pphys < (SIZE_MAX/(size_t)psize)) { + config->physical_memory = (size_t)pphys * (size_t)psize; + } + #endif } config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? config->has_overcommit = unix_detect_overcommit(); @@ -183,10 +188,11 @@ int _mi_prim_free(void* addr, size_t size ) { static int unix_madvise(void* addr, size_t size, int advice) { #if defined(__sun) - return madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + int res = madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) #else - return madvise(addr, size, advice); + int res = madvise(addr, size, advice); #endif + return (res==0 ? 0 : errno); } static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { @@ -242,7 +248,7 @@ static int unix_mmap_fd(void) { #if defined(VM_MAKE_TAG) // macOS: tracking anonymous page with a specific ID.
(All up to 98 are taken officially but LLVM sanitizers had taken 99) int os_tag = (int)mi_option_get(mi_option_os_tag); - if (os_tag < 100 || os_tag > 255) { os_tag = 100; } + if (os_tag < 100 || os_tag > 255) { os_tag = 254; } return VM_MAKE_TAG(os_tag); #else return -1; @@ -266,7 +272,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD #endif // huge page allocation - if ((large_only || _mi_os_use_large_page(size, try_alignment)) && allow_large) { + if (allow_large && (large_only || (_mi_os_use_large_page(size, try_alignment) && mi_option_get(mi_option_allow_large_os_pages) == 1))) { static _Atomic(size_t) large_page_try_ok; // = 0; size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { @@ -287,7 +293,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec #endif #ifdef MAP_HUGE_1GB static bool mi_huge_pages_available = true; - if ((size % MI_GiB) == 0 && mi_huge_pages_available) { + if (large_only && (size % MI_GiB) == 0 && mi_huge_pages_available) { lflags |= MAP_HUGE_1GB; } else @@ -307,7 +313,9 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec #ifdef MAP_HUGE_1GB if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) { mi_huge_pages_available = false; // don't try huge 1GiB pages again - _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); + if (large_only) { + _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); + } lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); } @@ -333,7 +341,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec // when large OS pages are enabled for mimalloc, we call `madvise` anyways. 
if (allow_large && _mi_os_use_large_page(size, try_alignment)) { if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) { - *is_large = true; // possibly + // *is_large = true; // possibly }; } #elif defined(__sun) @@ -342,7 +350,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec cmd.mha_pagesize = _mi_os_large_page_size(); cmd.mha_cmd = MHA_MAPSIZE_VA; if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { - *is_large = true; + // *is_large = true; // possibly } } #endif @@ -352,14 +360,14 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec } // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); *is_zero = true; int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); - *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); + *addr = unix_mmap(hint_addr, size, try_alignment, protect_flags, false, allow_large, is_large); return (*addr != NULL ? 0 : errno); } @@ -761,7 +769,7 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { - // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf + // We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf // may fail silently on macOS. 
See PR #390, and return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); } diff --git a/third-party/mimalloc/src/prim/wasi/prim.c b/third-party/mimalloc/src/prim/wasi/prim.c index e95f67f587..e1e7de5efd 100644 --- a/third-party/mimalloc/src/prim/wasi/prim.c +++ b/third-party/mimalloc/src/prim/wasi/prim.c @@ -9,7 +9,6 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include <stdio.h> // fputs @@ -22,7 +21,7 @@ terms of the MIT license. A copy of the license can be found in the file void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB config->alloc_granularity = 16; - config->has_overcommit = false; + config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = false; }
-int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { - MI_UNUSED(allow_large); MI_UNUSED(commit); +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { + MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr); *is_large = false; *is_zero = false; *addr = mi_prim_mem_grow(size, try_alignment); @@ -134,7 +133,7 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la //--------------------------------------------- int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { - MI_UNUSED(addr); MI_UNUSED(size); + MI_UNUSED(addr); MI_UNUSED(size); *is_zero = false; return 0; } @@ -199,9 +198,9 @@ mi_msecs_t _mi_prim_clock_now(void) { // low resolution timer mi_msecs_t _mi_prim_clock_now(void) { #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) - return (mi_msecs_t)clock(); + return (mi_msecs_t)clock(); #elif (CLOCKS_PER_SEC < 1000) - return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); #else return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); #endif diff --git a/third-party/mimalloc/src/prim/windows/prim.c b/third-party/mimalloc/src/prim/windows/prim.c index 5074ad4cbd..9686fe0736 100644 --- a/third-party/mimalloc/src/prim/windows/prim.c +++ b/third-party/mimalloc/src/prim/windows/prim.c @@ -9,7 +9,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include <stdio.h> // fputs, stderr @@ -51,7 +50,7 @@ typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T* static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; -// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7 +// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7 typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER; typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber); @@ -119,6 +118,18 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) GetSystemInfo(&si); if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; } + // get virtual address bits + if ((uintptr_t)si.lpMaximumApplicationAddress > 0) { + const size_t vbits = MI_INTPTR_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress); + config->virtual_address_bits = vbits; + } + // get physical memory + ULONGLONG memInKiB = 0; + if (GetPhysicallyInstalledSystemMemory(&memInKiB)) { + if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { + config->physical_memory = memInKiB * MI_KiB; + } + } // get the VirtualAlloc2 function HINSTANCE hDll; hDll = LoadLibrary(TEXT("kernelbase.dll")); @@ -162,7 +173,7 @@ int _mi_prim_free(void* addr, size_t size ) { // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside // the memory region returned by VirtualAlloc; in that case we need to free using // the start of the region.
- MEMORY_BASIC_INFORMATION info = { 0 }; + MEMORY_BASIC_INFORMATION info; _mi_memzero_var(info); VirtualQuery(addr, &info, sizeof(info)); if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { errcode = 0; @@ -192,7 +203,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali } #endif // on modern Windows try use VirtualAlloc2 for aligned allocation - if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { + if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; reqs.Alignment = try_alignment; MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; @@ -231,7 +242,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && win_is_out_of_memory_error(GetLastError())) { - // if committing regular memory and being out-of-memory, + // if committing regular memory and being out-of-memory, // keep trying for a bit in case memory frees up after all. See issue #894 _mi_warning_message("out-of-memory on OS allocation, try again... 
(attempt %lu, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", tries, size, GetLastError(), addr, try_alignment, flags); long sleep_msecs = tries*40; // increasing waits @@ -280,14 +291,14 @@ static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DW return p; } -int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); *is_zero = true; int flags = MEM_RESERVE; if (commit) { flags |= MEM_COMMIT; } - *addr = win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); + *addr = win_virtual_alloc(hint_addr, size, try_alignment, flags, false, allow_large, is_large); return (*addr != NULL ? 0 : (int)GetLastError()); } @@ -316,7 +327,7 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { return 0; } -int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT); *needs_recommit = true; // for safety, assume always decommitted even in the case of an error. return (ok ? 
0 : (int)GetLastError()); @@ -468,7 +479,6 @@ mi_msecs_t _mi_prim_clock_now(void) { // Process Info //---------------------------------------------------------------- -#include #include static mi_msecs_t filetime_msecs(const FILETIME* ftime) { @@ -491,7 +501,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); pinfo->utime = filetime_msecs(&ut); pinfo->stime = filetime_msecs(&st); - + // load psapi on demand if (pGetProcessMemoryInfo == NULL) { HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); @@ -501,11 +511,10 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) } // get process info - PROCESS_MEMORY_COUNTERS info; - memset(&info, 0, sizeof(info)); + PROCESS_MEMORY_COUNTERS info; _mi_memzero_var(info); if (pGetProcessMemoryInfo != NULL) { pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); - } + } pinfo->current_rss = (size_t)info.WorkingSetSize; pinfo->peak_rss = (size_t)info.PeakWorkingSetSize; pinfo->current_commit = (size_t)info.PagefileUsage; @@ -517,7 +526,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) // Output //---------------------------------------------------------------- -void _mi_prim_out_stderr( const char* msg ) +void _mi_prim_out_stderr( const char* msg ) { // on windows with redirection, the C runtime cannot handle locale dependent output // after the main thread closes so we use direct console output. 
@@ -564,7 +573,6 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { } - //---------------------------------------------------------------- // Random //---------------------------------------------------------------- @@ -600,64 +608,210 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { } if (pBCryptGenRandom == NULL) return false; } - return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); + return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); } #endif // MI_USE_RTLGENRANDOM + + //---------------------------------------------------------------- -// Thread init/done +// Process & Thread Init/Done //---------------------------------------------------------------- -#if !defined(MI_SHARED_LIB) +static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { + MI_UNUSED(reserved); + MI_UNUSED(module); + #if MI_TLS_SLOT >= 2 + if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) { + _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); + } + #endif + if (reason==DLL_PROCESS_ATTACH) { + _mi_process_load(); + } + else if (reason==DLL_PROCESS_DETACH) { + _mi_process_done(); + } + else if (reason==DLL_THREAD_DETACH && !_mi_is_redirected()) { + _mi_thread_done(NULL); + } +} -// use thread local storage keys to detect thread ending -// note: another design could be to use special linker sections (see issue #869) -#include -#if (_WIN32_WINNT < 0x600) // before Windows Vista -WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); -WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); -WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); -WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); -#endif -static DWORD mi_fls_key = (DWORD)(-1); +#if defined(MI_SHARED_LIB) + #define MI_PRIM_HAS_PROCESS_ATTACH 1 -static void NTAPI 
mi_fls_done(PVOID value) { - mi_heap_t* heap = (mi_heap_t*)value; - if (heap != NULL) { - _mi_thread_done(heap); - FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672 + // Windows DLL: easy to hook into process_init and thread_done + __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { + mi_win_main((PVOID)inst,reason,reserved); + return TRUE; } -} -void _mi_prim_thread_init_auto_done(void) { - mi_fls_key = FlsAlloc(&mi_fls_done); -} + // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event. + void _mi_prim_thread_init_auto_done(void) { } + void _mi_prim_thread_done_auto_done(void) { } + void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); + } -void _mi_prim_thread_done_auto_done(void) { - // call thread-done on all threads (except the main thread) to prevent - // dangling callback pointer if statically linked with a DLL; Issue #208 - FlsFree(mi_fls_key); -} +#elif !defined(MI_WIN_USE_FLS) + #define MI_PRIM_HAS_PROCESS_ATTACH 1 -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { - mi_assert_internal(mi_fls_key != (DWORD)(-1)); - FlsSetValue(mi_fls_key, heap); -} + static void NTAPI mi_win_main_attach(PVOID module, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_ATTACH || reason == DLL_THREAD_ATTACH) { + mi_win_main(module, reason, reserved); + } + } + static void NTAPI mi_win_main_detach(PVOID module, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_DETACH || reason == DLL_THREAD_DETACH) { + mi_win_main(module, reason, reserved); + } + } -#else + // Set up TLS callbacks in a statically linked library by using special data sections. + // See + // We use 2 entries to ensure we call attach events before constructors + // are called, and detach events after destructors are called. 
+ #if defined(__cplusplus) + extern "C" { + #endif -// Dll; nothing to do as in that case thread_done is handled through the DLL_THREAD_DETACH event. + #if defined(_WIN64) + #pragma comment(linker, "/INCLUDE:_tls_used") + #pragma comment(linker, "/INCLUDE:_mi_tls_callback_pre") + #pragma comment(linker, "/INCLUDE:_mi_tls_callback_post") + #pragma const_seg(".CRT$XLB") + extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[]; + const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach }; + #pragma const_seg() + #pragma const_seg(".CRT$XLY") + extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[]; + const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach }; + #pragma const_seg() + #else + #pragma comment(linker, "/INCLUDE:__tls_used") + #pragma comment(linker, "/INCLUDE:__mi_tls_callback_pre") + #pragma comment(linker, "/INCLUDE:__mi_tls_callback_post") + #pragma data_seg(".CRT$XLB") + PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach }; + #pragma data_seg() + #pragma data_seg(".CRT$XLY") + PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach }; + #pragma data_seg() + #endif -void _mi_prim_thread_init_auto_done(void) { -} + #if defined(__cplusplus) + } + #endif -void _mi_prim_thread_done_auto_done(void) { -} + // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event. 
+ void _mi_prim_thread_init_auto_done(void) { } + void _mi_prim_thread_done_auto_done(void) { } + void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); + } -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { - MI_UNUSED(heap); -} +#else // deprecated: statically linked, use fiber api + #if defined(_MSC_VER) // on clang/gcc use the constructor attribute (in `src/prim/prim.c`) + // MSVC: use data section magic for static libraries + // See + #define MI_PRIM_HAS_PROCESS_ATTACH 1 + + static int mi_process_attach(void) { + mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL); + atexit(&_mi_process_done); + return 0; + } + typedef int(*mi_crt_callback_t)(void); + #if defined(_WIN64) + #pragma comment(linker, "/INCLUDE:_mi_tls_callback") + #pragma section(".CRT$XIU", long, read) + #else + #pragma comment(linker, "/INCLUDE:__mi_tls_callback") + #endif + #pragma data_seg(".CRT$XIU") + mi_decl_externc mi_crt_callback_t _mi_tls_callback[] = { &mi_process_attach }; + #pragma data_seg() + #endif + + // use the fiber api for calling `_mi_thread_done`. 
+ #include + #if (_WIN32_WINNT < 0x600) // before Windows Vista + WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); + WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); + WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); + WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); + #endif + + static DWORD mi_fls_key = (DWORD)(-1); + + static void NTAPI mi_fls_done(PVOID value) { + mi_heap_t* heap = (mi_heap_t*)value; + if (heap != NULL) { + _mi_thread_done(heap); + FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672 + } + } + + void _mi_prim_thread_init_auto_done(void) { + mi_fls_key = FlsAlloc(&mi_fls_done); + } + + void _mi_prim_thread_done_auto_done(void) { + // call thread-done on all threads (except the main thread) to prevent + // dangling callback pointer if statically linked with a DLL; Issue #208 + FlsFree(mi_fls_key); + } + + void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + mi_assert_internal(mi_fls_key != (DWORD)(-1)); + FlsSetValue(mi_fls_key, heap); + } +#endif + +// ---------------------------------------------------- +// Communicate with the redirection module on Windows +// ---------------------------------------------------- +#if defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT) + #define MI_PRIM_HAS_ALLOCATOR_INIT 1 + + static bool mi_redirected = false; // true if malloc redirects to mi_malloc + + bool _mi_is_redirected(void) { + return mi_redirected; + } + + #ifdef __cplusplus + extern "C" { + #endif + mi_decl_export void _mi_redirect_entry(DWORD reason) { + // called on redirection; careful as this may be called before DllMain + #if MI_TLS_SLOT >= 2 + if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) { + _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); + } + #endif + if (reason == DLL_PROCESS_ATTACH) { + mi_redirected = true; + } 
+ else if (reason == DLL_PROCESS_DETACH) { + mi_redirected = false; + } + else if (reason == DLL_THREAD_DETACH) { + _mi_thread_done(NULL); + } + } + __declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message); + __declspec(dllimport) void mi_cdecl mi_allocator_done(void); + #ifdef __cplusplus + } + #endif + bool _mi_allocator_init(const char** message) { + return mi_allocator_init(message); + } + void _mi_allocator_done(void) { + mi_allocator_done(); + } #endif diff --git a/third-party/mimalloc/src/segment-map.c b/third-party/mimalloc/src/segment-map.c index 1efb1e2360..5809342cf7 100644 --- a/third-party/mimalloc/src/segment-map.c +++ b/third-party/mimalloc/src/segment-map.c @@ -16,140 +16,121 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "mimalloc/atomic.h" -#if (MI_INTPTR_SIZE>=8) && MI_TRACK_ASAN -#define MI_MAX_ADDRESS ((size_t)140 << 40) // 140TB (see issue #881) -#elif (MI_INTPTR_SIZE >= 8) -#define MI_MAX_ADDRESS ((size_t)40 << 40) // 40TB (to include huge page areas) +// Reduce total address space to reduce .bss (due to the `mi_segment_map`) +#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN +#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881) +#elif (MI_INTPTR_SIZE > 4) +#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB #else -#define MI_MAX_ADDRESS ((size_t)2 << 30) // 2Gb +#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX) #endif -#define MI_SEGMENT_MAP_BITS (MI_MAX_ADDRESS / MI_SEGMENT_SIZE) -#define MI_SEGMENT_MAP_SIZE (MI_SEGMENT_MAP_BITS / 8) -#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE) +#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) ! 
+#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE) +#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE) +#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN) +#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN) +#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1) -static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments +// A part of the segment map. +typedef struct mi_segmap_part_s { + mi_memid_t memid; + _Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES]; +} mi_segmap_part_t; -static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { +// Allocate parts on-demand to reduce .bss footprint +static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } + +static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { // note: segment can be invalid or NULL. mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? 
- if ((uintptr_t)segment >= MI_MAX_ADDRESS) { - *bitidx = 0; - return MI_SEGMENT_MAP_WSIZE; - } - else { - const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE; - *bitidx = segindex % MI_INTPTR_BITS; - const size_t mapindex = segindex / MI_INTPTR_BITS; - mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE); - return mapindex; + *idx = 0; + *bitidx = 0; + if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; + const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; + if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; + mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]); + + // allocate on demand to reduce .bss footprint + if (part == NULL) { + if (!create_on_demand) return NULL; + mi_memid_t memid; + part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid); + if (part == NULL) return NULL; + part->memid = memid; + mi_segmap_part_t* expected = NULL; + if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) { + _mi_os_free(part, sizeof(mi_segmap_part_t), memid); + part = expected; + if (part == NULL) return NULL; + } } + mi_assert(part != NULL); + const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN; + const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN; + *idx = bitofs / MI_INTPTR_BITS; + *bitidx = bitofs % MI_INTPTR_BITS; + return part; } void _mi_segment_map_allocated_at(const mi_segment_t* segment) { + if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index==MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, 
&bitidx); + if (part == NULL) return; // outside our address range.. + uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask | ((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); + } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } void _mi_segment_map_freed_at(const mi_segment_t* segment) { + if (segment->memid.memkind == MI_MEM_ARENA) return; + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index == MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx); + if (part == NULL) return; // outside our address range.. + uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask & ~((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); + } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } // Determine the segment belonging to a pointer or NULL if it is not in a valid segment. 
static mi_segment_t* _mi_segment_of(const void* p) { if (p == NULL) return NULL; mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge - const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx); + if (part == NULL) return NULL; + const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { + bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok); return segment; // yes, allocated by us } - if (index==MI_SEGMENT_MAP_WSIZE) return NULL; - - // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers? - - // search downwards for the first segment in case it is an interior pointer - // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough - // valid huge objects - // note: we could maintain a lowest index to speed up the path for invalid pointers? - size_t lobitidx; - size_t loindex; - uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1); - if (lobits != 0) { - loindex = index; - lobitidx = mi_bsr(lobits); // lobits != 0 - } - else if (index == 0) { - return NULL; - } - else { - mi_assert_internal(index > 0); - uintptr_t lomask = mask; - loindex = index; - do { - loindex--; - lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]); - } while (lomask != 0 && loindex > 0); - if (lomask == 0) return NULL; - lobitidx = mi_bsr(lomask); // lomask != 0 - } - mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE); - // take difference as the addresses could be larger than the MAX_ADDRESS space. 
- size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE; - segment = (mi_segment_t*)((uint8_t*)segment - diff); - - if (segment == NULL) return NULL; - mi_assert_internal((void*)segment < p); - bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(cookie_ok); - if mi_unlikely(!cookie_ok) return NULL; - if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range - mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment)); - return segment; + return NULL; } // Is this a valid pointer in our heap? -static bool mi_is_valid_pointer(const void* p) { - return ((_mi_segment_of(p) != NULL) || (_mi_arena_contains(p))); +static bool mi_is_valid_pointer(const void* p) { + // first check if it is in an arena, then check if it is OS allocated + return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { return mi_is_valid_pointer(p); } -/* -// Return the full segment range belonging to a pointer -static void* mi_segment_range_of(const void* p, size_t* size) { - mi_segment_t* segment = _mi_segment_of(p); - if (segment == NULL) { - if (size != NULL) *size = 0; - return NULL; - } - else { - if (size != NULL) *size = segment->segment_size; - return segment; +void _mi_segment_map_unsafe_destroy(void) { + for (size_t i = 0; i < MI_SEGMENT_MAP_MAX_PARTS; i++) { + mi_segmap_part_t* part = mi_atomic_exchange_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[i], NULL); + if (part != NULL) { + _mi_os_free(part, sizeof(mi_segmap_part_t), part->memid); + } } - mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); - mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); - mi_reset_delayed(tld); - mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); - return page; } -*/ diff --git a/third-party/mimalloc/src/segment.c b/third-party/mimalloc/src/segment.c index 4e4dcb80ee..2c0b9ec31d 100644 --- a/third-party/mimalloc/src/segment.c +++ b/third-party/mimalloc/src/segment.c @@ -17,7 +17,7 @@ terms of the MIT license. A copy of the license can be found in the file // ------------------------------------------------------------------- -static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats); +static void mi_segment_try_purge(mi_segment_t* segment, bool force); // ------------------------------------------------------------------- @@ -150,6 +150,23 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) { /* -------------------------------------------------------------------------------- Segment allocation + We allocate pages inside bigger "segments" (32 MiB on 64-bit). This is to avoid + splitting VMA's on Linux and reduce fragmentation on other OS's. + Each thread owns its own segments. + + Currently we have: + - small pages (64KiB) + - medium pages (512KiB) + - large pages (4MiB), + - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`. + it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`. + + The memory for a segment is usually committed on demand. + (i.e. we are careful to not touch the memory until we actually allocate a block there) + + If a thread ends, it "abandons" pages that still contain live blocks. + Such segments are abandoned and these can be reclaimed by still running threads, + (much like work-stealing). 
-------------------------------------------------------------------------------- */ @@ -332,6 +349,9 @@ static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, c if (block_size <= 64) { start_offset += 3*block_size; } else if (block_size <= 512) { start_offset += block_size; } } + start_offset = _mi_align_up(start_offset, MI_MAX_ALIGN_SIZE); + mi_assert_internal(_mi_is_aligned(pstart + start_offset, MI_MAX_ALIGN_SIZE)); + mi_assert_internal(block_size == 0 || block_size > MI_MAX_ALIGN_GUARANTEE || _mi_is_aligned(pstart + start_offset,block_size)); if (page_size != NULL) { *page_size = psize - start_offset; } return (pstart + start_offset); } @@ -407,8 +427,7 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) { const size_t size = mi_segment_size(segment); const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size); - _mi_abandoned_await_readers(); // wait until safe to free - _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid, tld->stats); + _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid); } /* ----------------------------------------------------------- @@ -465,7 +484,7 @@ static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uin mi_commit_mask_create(bitidx, bitcount, cm); } -static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { +static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // commit liberal @@ -481,7 +500,7 @@ static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size, mi mi_commit_mask_t cmask; mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap - if (!_mi_os_commit(start, full_size, &is_zero, 
stats)) return false; + if (!_mi_os_commit(start, full_size, &is_zero)) return false; mi_commit_mask_set(&segment->commit_mask, &mask); } @@ -495,15 +514,15 @@ static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size, mi return true; } -static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { +static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->purge_mask)) return true; // fully committed mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); - return mi_segment_commit(segment, p, size, stats); + return mi_segment_commit(segment, p, size); } -static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { +static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); if (!segment->allow_purge) return true; @@ -518,7 +537,7 @@ static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_ // purging mi_assert_internal((void*)start != (void*)segment); mi_assert_internal(segment->allow_decommit); - const bool decommitted = _mi_os_purge(start, full_size, stats); // reset or decommit + const bool decommitted = _mi_os_purge(start, full_size); // reset or decommit if (decommitted) { mi_commit_mask_t cmask; mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); @@ -532,11 +551,11 @@ static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_ return true; } -static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { +static void 
mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t size) { if (!segment->allow_purge) return; if (mi_option_get(mi_option_purge_delay) == 0) { - mi_segment_purge(segment, p, size, stats); + mi_segment_purge(segment, p, size); } else { // register for future purge in the purge mask @@ -559,7 +578,7 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t else if (segment->purge_expire <= now) { // previous purge mask already expired if (segment->purge_expire + mi_option_get(mi_option_purge_extend_delay) <= now) { - mi_segment_try_purge(segment, true, stats); + mi_segment_try_purge(segment, true); } else { segment->purge_expire = now + mi_option_get(mi_option_purge_extend_delay); // (mi_option_get(mi_option_purge_delay) / 8); // wait a tiny bit longer in case there is a series of free's @@ -572,7 +591,7 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t } } -static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats) { +static void mi_segment_try_purge(mi_segment_t* segment, bool force) { if (!segment->allow_purge || segment->purge_expire == 0 || mi_commit_mask_is_empty(&segment->purge_mask)) return; mi_msecs_t now = _mi_clock_now(); if (!force && now < segment->purge_expire) return; @@ -588,7 +607,7 @@ static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* if (count > 0) { uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE); size_t size = count * MI_COMMIT_SIZE; - mi_segment_purge(segment, p, size, stats); + mi_segment_purge(segment, p, size); } } mi_commit_mask_foreach_end() @@ -597,8 +616,8 @@ static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* // called from `mi_heap_collect_ex` // this can be called per-page so it is important that try_purge has fast exit path -void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { - mi_segment_try_purge(segment, force, tld->stats); +void 
_mi_segment_collect(mi_segment_t* segment, bool force) { + mi_segment_try_purge(segment, force); } /* ----------------------------------------------------------- @@ -633,7 +652,7 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size // perhaps decommit if (allow_purge) { - mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats); + mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE); } // and push it on the free page queue (if it was not a huge page) @@ -662,7 +681,6 @@ static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) { mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0); mi_segment_t* const segment = _mi_ptr_segment(slice); - const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment); // for huge pages, just mark as free but don't add to the queues if (segment->kind == MI_SEGMENT_HUGE) { @@ -675,6 +693,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ } // otherwise coalesce the span and add to the free span queues + const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment); size_t slice_count = slice->slice_count; mi_slice_t* next = slice + slice->slice_count; mi_assert_internal(next <= mi_segment_slices_end(segment)); @@ -691,6 +710,8 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ // free previous slice -- remove it from free and merge mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0); slice_count += prev->slice_count; + slice->slice_count = 0; + slice->slice_offset = (uint32_t)((uint8_t*)slice - (uint8_t*)prev); // set the slice offset for `segment_force_abandon` (in case the previous free block is very large). 
if (!is_abandoned) { mi_segment_span_remove_from_queue(prev, tld); } slice = prev; } @@ -708,13 +729,13 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_ ----------------------------------------------------------- */ // Note: may still return NULL if committing the memory failed -static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) { +static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count) { mi_assert_internal(slice_index < segment->slice_entries); mi_slice_t* const slice = &segment->slices[slice_index]; mi_assert_internal(slice->block_size==0 || slice->block_size==1); // commit before changing the slice data - if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats)) { + if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE)) { return NULL; // commit failed! 
} @@ -787,7 +808,7 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren mi_segment_slice_split(segment, slice, slice_count, tld); } mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->block_size > 0); - mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld); + mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count); if (page == NULL) { // commit failed; return NULL but first restore the slice mi_segment_span_free_coalesce(slice, tld); @@ -810,7 +831,7 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_aren static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id, size_t* psegment_slices, size_t* pinfo_slices, - bool commit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) + bool commit, mi_segments_tld_t* tld) { mi_memid_t memid; @@ -831,7 +852,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment } const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE; - mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid, os_tld); + mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid); if (segment == NULL) { return NULL; // failed to allocate } @@ -847,8 +868,8 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment mi_assert_internal(commit_needed>0); mi_commit_mask_create(0, commit_needed, &commit_mask); mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE); - if (!_mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, NULL, tld->stats)) { - _mi_arena_free(segment,segment_size,0,memid,tld->stats); + if (!_mi_os_commit(segment, 
commit_needed*MI_COMMIT_SIZE, NULL)) { + _mi_arena_free(segment,segment_size,0,memid); return NULL; } } @@ -858,6 +879,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment segment->allow_decommit = !memid.is_pinned; segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0); segment->segment_size = segment_size; + segment->subproc = tld->subproc; segment->commit_mask = commit_mask; segment->purge_expire = 0; mi_commit_mask_create_empty(&segment->purge_mask); @@ -869,7 +891,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment // Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . -static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) +static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_page_t** huge_page) { mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL)); @@ -881,13 +903,13 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little) const bool eager_delay = (// !_mi_os_has_overcommit() && // never delay on overcommit systems _mi_current_thread_count() > 1 && // do not delay for the first N threads - tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); + tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay)); const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit); bool commit = eager || (required > 0); // Allocate the segment from the OS mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, - &segment_slices, &info_slices, commit, tld, os_tld); + &segment_slices, &info_slices, 
commit, tld); if (segment == NULL) return NULL; // zero the segment info? -- not always needed as it may be zero initialized from the OS @@ -915,17 +937,17 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi if (MI_SECURE>0) { // in secure mode, we set up a protected page in between the segment info // and the page data, and at the end of the segment. - size_t os_pagesize = _mi_os_page_size(); + size_t os_pagesize = _mi_os_page_size(); _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize); uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize; - mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats); + mi_segment_ensure_committed(segment, end, os_pagesize); _mi_os_protect(end, os_pagesize); if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-( guard_slices = 1; } // reserve first slices for segment info - mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld); + mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices); mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance mi_assert_internal(segment->used == 1); segment->used = 0; // don't count our internal slices towards usage @@ -939,7 +961,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi mi_assert_internal(huge_page!=NULL); mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask)); - *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld); + *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices); mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance } @@ -953,6 +975,9 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t 
mi_assert_internal(segment != NULL); mi_assert_internal(segment->next == NULL); mi_assert_internal(segment->used == 0); + + // in `mi_segment_force_abandon` we set this to true to ensure the segment's memory stays valid + if (segment->dont_free) return; // Remove the free pages mi_slice_t* slice = &segment->slices[0]; @@ -1003,7 +1028,7 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) { size_t psize; uint8_t* start = _mi_segment_page_start(segment, page, &psize); - _mi_os_reset(start, psize, tld->stats); + _mi_os_reset(start, psize); } // zero the page data, but not the segment fields and heap tag @@ -1025,7 +1050,6 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) { mi_assert(page != NULL); - mi_segment_t* segment = _mi_page_segment(page); mi_assert_expensive(mi_segment_is_valid(segment,tld)); @@ -1043,7 +1067,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) } else { // perform delayed purges - mi_segment_try_purge(segment, false /* force? */, tld->stats); + mi_segment_try_purge(segment, false /* force? */); } } @@ -1061,16 +1085,11 @@ When a block is freed in an abandoned segment, the segment is reclaimed into that thread. Moreover, if threads are looking for a fresh segment, they -will first consider abondoned segments -- these can be found +will first consider abandoned segments -- these can be found by scanning the arena memory (segments outside arena memoryare only reclaimed by a free). 
----------------------------------------------------------- */ -// legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list -void _mi_abandoned_await_readers(void) { - // nothing needed -} - /* ----------------------------------------------------------- Abandon segment/page ----------------------------------------------------------- */ @@ -1098,7 +1117,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { // Only abandoned segments in arena memory can be reclaimed without a free // so if a segment is not from an arena we force purge here to be conservative. const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); - mi_segment_try_purge(segment, force_purge, tld->stats); + mi_segment_try_purge(segment, force_purge); // all pages in the segment are abandoned; add it to the abandoned list _mi_stat_increase(&tld->stats->segments_abandoned, 1); @@ -1190,6 +1209,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); + mi_assert_internal(segment->subproc == heap->tld->segments.subproc); // only reclaim within the same subprocess mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; segment->was_reclaimed = true; @@ -1213,12 +1233,13 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_assert_internal(page->next == NULL && page->prev==NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); segment->abandoned--; - // set the heap again and allow heap thread delayed free again. 
+ // get the target heap for this thread which has a matching heap tag (so we reclaim into a matching heap) mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using %u instead)\n", page->heap_tag, heap->tag ); + _mi_error_message(EFAULT, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); } + // associate the heap with this page, and allow heap thread delayed free again. mi_page_set_heap(page, target_heap); _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date @@ -1254,12 +1275,21 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, } } + // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned - // don't reclaim more from a free than half the current segments + if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess + if (!_mi_heap_memid_is_suitable(heap,segment->memid)) return false; // don't reclaim between exclusive and non-exclusive arena's + const long target = _mi_option_get_fast(mi_option_target_segments_per_thread); + if (target > 0 && (size_t)target <= heap->tld->segments.count) return false; // don't reclaim if going above the target count + + // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments - if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; + // 
(but not for out-of-arena segments as that is the main way to be reclaimed for those) + if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { + return false; + } if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); mi_assert_internal(res == segment); @@ -1270,17 +1300,26 @@ bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + mi_arena_field_cursor_t current; + _mi_arena_field_cursor_init(heap, tld->subproc, true /* visit all, blocking */, ¤t); while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } + _mi_arena_field_cursor_done(¤t); +} + + +static bool segment_count_is_within_target(mi_segments_tld_t* tld, size_t* ptarget) { + const size_t target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 0, 1024); + if (ptarget != NULL) { *ptarget = target; } + return (target == 0 || tld->count < target); } -static long mi_segment_get_reclaim_tries(void) { +static long mi_segment_get_reclaim_tries(mi_segments_tld_t* tld) { // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); if (perc <= 0) return 0; - const size_t total_count = _mi_arena_segment_abandoned_count(); + const size_t total_count = mi_atomic_load_relaxed(&tld->subproc->abandoned_count); if (total_count == 0) return 0; const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 
1024 : relative_count)); @@ -1291,15 +1330,18 @@ static long mi_segment_get_reclaim_tries(void) { static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; - long max_tries = mi_segment_get_reclaim_tries(); + long max_tries = mi_segment_get_reclaim_tries(tld); if (max_tries <= 0) return NULL; - mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) + mi_segment_t* result = NULL; + mi_segment_t* segment = NULL; + mi_arena_field_cursor_t current; + _mi_arena_field_cursor_init(heap, tld->subproc, false /* non-blocking */, ¤t); + while (segment_count_is_within_target(tld,NULL) && (max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { + mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process segment->abandoned_visits++; - // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? + // todo: should we respect numa affinity for abandoned reclaim? perhaps only for the first visit? // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries // Perhaps we can skip non-suitable ones in a better way? bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); @@ -1316,27 +1358,29 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice // found a large enough free span, or a page of the right block_size with free space // we return the result of reclaim (which is usually `segment`) as it might free // the segment due to concurrent frees (in which case `NULL` is returned). 
- return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); + result = mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); + break; } else if (segment->abandoned_visits > 3 && is_suitable) { - // always reclaim on 3rd visit to limit the abandoned queue length. + // always reclaim on 3rd visit to limit the abandoned segment count. mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { // otherwise, push on the visited list so it gets not looked at too quickly again - mi_segment_try_purge(segment, false /* true force? */, tld->stats); // force purge if needed as we may not visit soon again + mi_segment_try_purge(segment, false /* true force? */); // force purge if needed as we may not visit soon again _mi_arena_segment_mark_abandoned(segment); } } - return NULL; + _mi_arena_field_cursor_done(¤t); + return result; } - +// collect abandoned segments void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) { mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); - long max_tries = (force ? (long)_mi_arena_segment_abandoned_count() : 1024); // limit latency + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, force /* blocking? */, ¤t); + long max_tries = (force ? (long)mi_atomic_load_relaxed(&tld->subproc->abandoned_count) : 1024); // limit latency while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees) if (segment->used == 0) { @@ -1348,20 +1392,121 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) else { // otherwise, purge if needed and push on the visited list // note: forced purge can be expensive if many threads are destroyed/created as in mstress. 
- mi_segment_try_purge(segment, force, tld->stats); + mi_segment_try_purge(segment, force); _mi_arena_segment_mark_abandoned(segment); } } + _mi_arena_field_cursor_done(¤t); +} + +/* ----------------------------------------------------------- + Force abandon a segment that is in use by our thread +----------------------------------------------------------- */ + +// force abandon a segment +static void mi_segment_force_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) +{ + mi_assert_internal(!mi_segment_is_abandoned(segment)); + mi_assert_internal(!segment->dont_free); + + // ensure the segment does not get free'd underneath us (so we can check if a page has been freed in `mi_page_force_abandon`) + segment->dont_free = true; + + // for all slices + const mi_slice_t* end; + mi_slice_t* slice = mi_slices_start_iterate(segment, &end); + while (slice < end) { + mi_assert_internal(slice->slice_count > 0); + mi_assert_internal(slice->slice_offset == 0); + if (mi_slice_is_used(slice)) { + // ensure used count is up to date and collect potential concurrent frees + mi_page_t* const page = mi_slice_to_page(slice); + _mi_page_free_collect(page, false); + { + // abandon the page if it is still in-use (this will free it if possible as well) + mi_assert_internal(segment->used > 0); + if (segment->used == segment->abandoned+1) { + // the last page.. abandon and return as the segment will be abandoned after this + // and we should no longer access it. 
+ segment->dont_free = false; + _mi_page_force_abandon(page); + return; + } + else { + // abandon and continue + _mi_page_force_abandon(page); + // it might be freed, reset the slice (note: relies on coalesce setting the slice_offset) + slice = mi_slice_first(slice); + } + } + } + slice = slice + slice->slice_count; + } + segment->dont_free = false; + mi_assert(segment->used == segment->abandoned); + mi_assert(segment->used == 0); + if (segment->used == 0) { // paranoia + // all free now + mi_segment_free(segment, false, tld); + } + else { + // perform delayed purges + mi_segment_try_purge(segment, false /* force? */); + } +} + + +// try abandon segments. +// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use. +static void mi_segments_try_abandon_to_target(mi_heap_t* heap, size_t target, mi_segments_tld_t* tld) { + if (target <= 1) return; + const size_t min_target = (target > 4 ? (target*3)/4 : target); // 75% + // todo: we should maintain a list of segments per thread; for now, only consider segments from the heap full pages + for (int i = 0; i < 64 && tld->count >= min_target; i++) { + mi_page_t* page = heap->pages[MI_BIN_FULL].first; + while (page != NULL && mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX) { + page = page->next; + } + if (page==NULL) { + break; + } + mi_segment_t* segment = _mi_page_segment(page); + mi_segment_force_abandon(segment, tld); + mi_assert_internal(page != heap->pages[MI_BIN_FULL].first); // as it is just abandoned + } +} + +// try abandon segments. +// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use. +static void mi_segments_try_abandon(mi_heap_t* heap, mi_segments_tld_t* tld) { + // we call this when we are about to add a fresh segment so we should be under our target segment count. 
+ size_t target = 0; + if (segment_count_is_within_target(tld, &target)) return; + mi_segments_try_abandon_to_target(heap, target, tld); +} + +void mi_collect_reduce(size_t target_size) mi_attr_noexcept { + mi_collect(true); + mi_heap_t* heap = mi_heap_get_default(); + mi_segments_tld_t* tld = &heap->tld->segments; + size_t target = target_size / MI_SEGMENT_SIZE; + if (target == 0) { + target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 1, 1024); + } + mi_segments_try_abandon_to_target(heap, target, tld); } /* ----------------------------------------------------------- Reclaim or allocate ----------------------------------------------------------- */ -static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld) { mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); + // try to abandon some segments to increase reuse between threads + mi_segments_try_abandon(heap,tld); + // 1. try to reclaim an abandoned segment bool reclaimed; mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld); @@ -1375,7 +1520,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_ return segment; } // 2. 
otherwise allocate a fresh segment - return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL); + return mi_segment_alloc(0, 0, heap->arena_id, tld, NULL); } @@ -1383,7 +1528,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_ Page allocation ----------------------------------------------------------- */ -static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld) { mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE); @@ -1394,18 +1539,18 @@ static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_ki mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, heap->arena_id, tld); //(required <= MI_SMALL_SIZE_MAX ? 0 : slices_needed), tld); if (page==NULL) { // no free page, allocate a new segment and try again - if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) { + if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld) == NULL) { // OOM or reclaimed a good page in the heap return NULL; } else { // otherwise try again - return mi_segments_page_alloc(heap, page_kind, required, block_size, tld, os_tld); + return mi_segments_page_alloc(heap, page_kind, required, block_size, tld); } } mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size); mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id()); - mi_segment_try_purge(_mi_ptr_segment(page), false, tld->stats); + mi_segment_try_purge(_mi_ptr_segment(page), false); return page; } @@ -1415,10 +1560,10 @@ static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_ki Huge page allocation ----------------------------------------------------------- */ -static 
mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) { mi_page_t* page = NULL; - mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,os_tld,&page); + mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,&page); if (segment == NULL || page==NULL) return NULL; mi_assert_internal(segment->used==1); mi_assert_internal(mi_page_block_size(page) >= size); @@ -1440,7 +1585,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_assert_internal(psize - (aligned_p - start) >= size); uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list ptrdiff_t decommit_size = aligned_p - decommit_start; - _mi_os_reset(decommit_start, decommit_size, &_mi_stats_main); // note: cannot use segment_decommit on huge segments + _mi_os_reset(decommit_start, decommit_size); // note: cannot use segment_decommit on huge segments } return page; @@ -1487,7 +1632,7 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc if (csize > sizeof(mi_block_t)) { csize = csize - sizeof(mi_block_t); uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); - _mi_os_reset(p, csize, &_mi_stats_main); // note: cannot use segment_decommit on huge segments + _mi_os_reset(p, csize); // note: cannot use segment_decommit on huge segments } } } @@ -1496,29 +1641,60 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc /* ----------------------------------------------------------- Page allocation and free ----------------------------------------------------------- */ -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { +mi_page_t* 
_mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld) { mi_page_t* page; if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { mi_assert_internal(_mi_is_power_of_two(page_alignment)); mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } - page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); + page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld); } else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld,os_tld); + page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld); } else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld, os_tld); + page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld); } else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld); + page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld); } else { - page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); + page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld); } mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid)); mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); + mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } +/* ----------------------------------------------------------- + Visit blocks in a segment (only used for abandoned segments) +----------------------------------------------------------- */ + +static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + 
mi_heap_area_t area; + _mi_heap_area_init(&area, page); + if (!visitor(NULL, &area, NULL, area.block_size, arg)) return false; + if (visit_blocks) { + return _mi_heap_area_visit_blocks(&area, page, visitor, arg); + } + else { + return true; + } +} + +bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + const mi_slice_t* end; + mi_slice_t* slice = mi_slices_start_iterate(segment, &end); + while (slice < end) { + if (mi_slice_is_used(slice)) { + mi_page_t* const page = mi_slice_to_page(slice); + if (heap_tag < 0 || (int)page->heap_tag == heap_tag) { + if (!mi_segment_visit_page(page, visit_blocks, visitor, arg)) return false; + } + } + slice = slice + slice->slice_count; + } + return true; +} diff --git a/third-party/mimalloc/src/static.c b/third-party/mimalloc/src/static.c index bf025eb794..9e06ce05aa 100644 --- a/third-party/mimalloc/src/static.c +++ b/third-party/mimalloc/src/static.c @@ -31,7 +31,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "options.c" #include "os.c" #include "page.c" // includes page-queue.c -#include "random.c" +#include "random.c" #include "segment.c" #include "segment-map.c" #include "stats.c" diff --git a/third-party/mimalloc/src/stats.c b/third-party/mimalloc/src/stats.c index a936402744..f82055938c 100644 --- a/third-party/mimalloc/src/stats.c +++ b/third-party/mimalloc/src/stats.c @@ -26,7 +26,7 @@ static bool mi_is_in_main(void* stat) { static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (amount == 0) return; - if (mi_is_in_main(stat)) + if mi_unlikely(mi_is_in_main(stat)) { // add atomically (for abandoned pages) int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); @@ -51,6 +51,27 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { } } +// Adjust stats to compensate; for example before committing a range, +// first adjust downwards with parts that were already committed so +// we avoid double counting. 
+static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) { + if (amount == 0) return; + if mi_unlikely(mi_is_in_main(stat)) + { + // adjust atomically + mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_addi64_relaxed(&stat->allocated, amount); + mi_atomic_addi64_relaxed(&stat->freed, amount); + } + else { + // don't affect the peak + stat->current += amount; + // add to both + stat->allocated += amount; + stat->freed += amount; + } +} + void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { if (mi_is_in_main(stat)) { mi_atomic_addi64_relaxed( &stat->count, 1 ); @@ -70,6 +91,14 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } +void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) { + mi_stat_adjust(stat, (int64_t)amount); +} + +void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) { + mi_stat_adjust(stat, -((int64_t)amount)); +} + // must be thread safe as it is called from stats_merge static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { if (stat==src) return; @@ -119,6 +148,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); mi_stat_counter_add(&stats->large_count, &src->large_count, 1); + mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) { @@ -345,6 +375,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); mi_stat_counter_print(&stats->purge_calls, "purges", out, arg); + mi_stat_counter_print(&stats->guarded_alloc_count, "guarded", out, 
arg); mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count()); diff --git a/third-party/mimalloc/test/main-override-static.c b/third-party/mimalloc/test/main-override-static.c index e71be29e95..1fee245221 100644 --- a/third-party/mimalloc/test/main-override-static.c +++ b/third-party/mimalloc/test/main-override-static.c @@ -1,3 +1,6 @@ +#if _WIN32 +#include +#endif #include #include #include @@ -12,6 +15,7 @@ static void double_free1(); static void double_free2(); static void corrupt_free(); static void block_overflow1(); +static void block_overflow2(); static void invalid_free(); static void test_aslr(void); static void test_process_info(void); @@ -21,15 +25,22 @@ static void alloc_huge(void); static void test_heap_walk(void); static void test_heap_arena(void); static void test_align(void); +static void test_canary_leak(void); +static void test_manage_os_memory(void); +// static void test_large_pages(void); int main() { mi_version(); mi_stats_reset(); + test_manage_os_memory(); + // test_large_pages(); // detect double frees and heap corruption // double_free1(); // double_free2(); // corrupt_free(); // block_overflow1(); + // block_overflow2(); + test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); @@ -66,7 +77,7 @@ int main() { //mi_stats_print(NULL); // test_process_info(); - + return 0; } @@ -88,6 +99,12 @@ static void block_overflow1() { free(p); } +static void block_overflow2() { + uint8_t* p = (uint8_t*)mi_malloc(16); + p[17] = 0; + free(p); +} + // The double free samples come ArcHeap [1] by Insu Yun (issue #161) // [1]: https://arxiv.org/pdf/1903.00503.pdf @@ -242,6 +259,78 @@ static void test_heap_arena(void) { } } +static void test_canary_leak(void) { + char* p = mi_mallocn_tp(char,23); + for(int i = 0; i < 23; i++) { + p[i] = '0'+i; + } + puts(p); + free(p); +} + +#if _WIN32 +static void 
test_manage_os_memory(void) { + size_t size = 256 * 1024 * 1024; + void* ptr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + mi_arena_id_t arena_id; + mi_manage_os_memory_ex(ptr, size, true /* committed */, true /* pinned */, false /* is zero */, -1 /* numa node */, true /* exclusive */, &arena_id); + mi_heap_t* cuda_heap = mi_heap_new_in_arena(arena_id); // you can do this in any thread + + // now allocate only in the cuda arena + void* p1 = mi_heap_malloc(cuda_heap, 8); + int* p2 = mi_heap_malloc_tp(cuda_heap, int); + *p2 = 42; + + // and maybe set the cuda heap as the default heap? (but careful as now `malloc` will allocate in the cuda heap as well) + { + mi_heap_t* prev_default_heap = mi_heap_set_default(cuda_heap); + void* p3 = mi_malloc(8); // allocate in the cuda heap + mi_free(p3); + } + mi_free(p1); + mi_free(p2); +} +#else +static void test_manage_os_memory(void) { + // empty +} +#endif + +// Experiment with huge OS pages +#if 0 + +#include +#include +#include +#include + +static void test_large_pages(void) { + mi_memid_t memid; + + #if 0 + size_t pages_reserved; + size_t page_size; + uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid); + const size_t req_size = pages_reserved * page_size; + #else + const size_t req_size = 64*MI_MiB; + uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL); + #endif + + p[0] = 1; + + //_mi_os_protect(p, _mi_os_page_size()); + //_mi_os_unprotect(p, _mi_os_page_size()); + //_mi_os_decommit(p, _mi_os_page_size(), NULL); + if (madvise(p, req_size, MADV_HUGEPAGE) == 0) { + printf("advised huge pages\n"); + _mi_os_decommit(p, _mi_os_page_size(), NULL); + }; + _mi_os_free(p, req_size, memid, NULL); +} + +#endif + // ---------------------------- // bin size experiments // ------------------------------ @@ -260,11 +349,11 @@ static void test_heap_arena(void) { static inline uint8_t mi_bsr32(uint32_t x); #if defined(_MSC_VER) -#include +//#include #include 
static inline uint8_t mi_bsr32(uint32_t x) { uint32_t idx; - _BitScanReverse((DWORD*)&idx, x); + _BitScanReverse(&idx, x); return idx; } #elif defined(__GNUC__) || defined(__clang__) @@ -288,7 +377,7 @@ static inline uint8_t mi_bsr32(uint32_t x) { } #endif -/* + // Bit scan reverse: return the index of the highest bit. uint8_t _mi_bsr(uintptr_t x) { if (x == 0) return 0; @@ -301,7 +390,7 @@ uint8_t _mi_bsr(uintptr_t x) { # error "define bsr for non-32 or 64-bit platforms" #endif } -*/ + static inline size_t _mi_wsize_from_size(size_t size) { @@ -378,11 +467,20 @@ static inline uint8_t _mi_bin4(size_t size) { return bin; } -static size_t _mi_binx4(size_t bsize) { - if (bsize==0) return 0; - uint8_t b = mi_bsr32((uint32_t)bsize); - if (b <= 1) return bsize; - size_t bin = ((b << 1) | (bsize >> (b - 1))&0x01); +static size_t _mi_binx4(size_t wsize) { + size_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + uint8_t b = mi_bsr32((uint32_t)wsize); + if (b <= 1) return wsize; + bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3; + } return bin; } @@ -394,22 +492,40 @@ static size_t _mi_binx8(size_t bsize) { return bin; } + +static inline size_t mi_bin(size_t wsize) { + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + wsize--; + // find the highest bit + uint8_t b = (uint8_t)mi_bsr32((uint32_t)wsize); // note: wsize != 0 + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
+ // - adjust with 3 because we use do not round the first 8 sizes + // which each get an exact bin + bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; + } + return bin; +} + + static void mi_bins(void) { //printf(" QNULL(1), /* 0 */ \\\n "); size_t last_bin = 0; - size_t min_bsize = 0; - size_t last_bsize = 0; - for (size_t bsize = 1; bsize < 2*1024; bsize++) { - size_t size = bsize * 64 * 1024; - size_t bin = _mi_binx8(bsize); + for (size_t wsize = 1; wsize <= (4*1024*1024) / 8 + 1024; wsize++) { + size_t bin = mi_bin(wsize); if (bin != last_bin) { - printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_bsize, last_bsize, last_bin); - //printf("QNULL(%6zd), ", wsize); - //if (last_bin%8 == 0) printf("/* %i */ \\\n ", last_bin); + //printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_wsize, last_wsize, last_bin); + printf("QNULL(%6zd), ", wsize-1); + if (last_bin%8 == 0) printf("/* %zu */ \\\n ", last_bin); last_bin = bin; - min_bsize = bsize; } - last_bsize = bsize; } } #endif diff --git a/third-party/mimalloc/test/main-override.cpp b/third-party/mimalloc/test/main-override.cpp index 582f24ee92..0fbb58e877 100644 --- a/third-party/mimalloc/test/main-override.cpp +++ b/third-party/mimalloc/test/main-override.cpp @@ -11,7 +11,7 @@ #include #include -#include +//#include #include #ifdef _WIN32 @@ -19,7 +19,7 @@ #endif #ifdef _WIN32 -#include +#include static void msleep(unsigned long msecs) { Sleep(msecs); } #else #include @@ -35,19 +35,24 @@ static void test_mt_shutdown(); static void large_alloc(void); // issue #363 static void fail_aslr(); // issue #372 static void tsan_numa_test(); // issue #414 -static void strdup_test(); // issue #445 +static void strdup_test(); // issue #445 static void bench_alloc_large(void); // issue #xxx //static void test_large_migrate(void); // issue #691 static void heap_thread_free_huge(); static void test_std_string(); // issue #697 - +static void test_thread_local(); // issue #944 +// static void 
test_mixed0(); // issue #942 +static void test_mixed1(); // issue #942 static void test_stl_allocators(); int main() { - // mi_stats_reset(); // ignore earlier allocations + mi_stats_reset(); // ignore earlier allocations + various_tests(); + test_mixed1(); - // test_std_string(); + //test_std_string(); + //test_thread_local(); // heap_thread_free_huge(); /* heap_thread_free_huge(); @@ -63,10 +68,9 @@ int main() { // test_stl_allocators(); // test_mt_shutdown(); // test_large_migrate(); - + //fail_aslr(); - // bench_alloc_large(); - // mi_stats_print(NULL); + mi_stats_print(NULL); return 0; } @@ -109,6 +113,9 @@ static void various_tests() { t = new (tbuf) Test(42); t->~Test(); delete[] tbuf; + + const char* ptr = ::_Getdays(); // test _base overrid + free((void*)ptr); } class Static { @@ -185,6 +192,53 @@ static void test_stl_allocators() { #endif } +#if 0 +#include +#include +#include +#include +#include +#include + +static void test_mixed0() { + std::vector> numbers(1024 * 1024 * 100); + std::vector threads(1); + + std::atomic index{}; + + auto start = std::chrono::system_clock::now(); + + for (auto& thread : threads) { + thread = std::thread{[&index, &numbers]() { + while (true) { + auto i = index.fetch_add(1, std::memory_order_relaxed); + if (i >= numbers.size()) return; + + numbers[i] = std::make_unique(i); + } + }}; + } + + for (auto& thread : threads) thread.join(); + + auto end = std::chrono::system_clock::now(); + + auto duration = + std::chrono::duration_cast(end - start); + std::cout << "Running on " << threads.size() << " threads took " << duration + << std::endl; +} +#endif + +void asd() { + void* p = malloc(128); + free(p); +} +static void test_mixed1() { + std::thread thread(asd); + thread.join(); +} + #if 0 // issue #691 static char* cptr; @@ -398,3 +452,30 @@ static void bench_alloc_large(void) { std::cout << "Avg " << us_per_allocation << " us per allocation" << std::endl; } + +class MTest +{ + char *data; +public: + MTest() { data = 
(char*)malloc(1024); } + ~MTest() { free(data); }; +}; + +thread_local MTest tlVariable; + +void threadFun( int i ) +{ + printf( "Thread %d\n", i ); + std::this_thread::sleep_for( std::chrono::milliseconds(100) ); +} + +void test_thread_local() +{ + for( int i=1; i < 100; ++i ) + { + std::thread t( threadFun, i ); + t.join(); + mi_stats_print(NULL); + } + return; +} diff --git a/third-party/mimalloc/test/test-api-fill.c b/third-party/mimalloc/test/test-api-fill.c index 3fca3b9d43..eebbd394ef 100644 --- a/third-party/mimalloc/test/test-api-fill.c +++ b/third-party/mimalloc/test/test-api-fill.c @@ -271,7 +271,7 @@ int main(void) { mi_free(p); }; - #if !(MI_TRACK_VALGRIND || MI_TRACK_ASAN) + #if !(MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_GUARDED) CHECK_BODY("fill-freed-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_malloc(malloc_size); diff --git a/third-party/mimalloc/test/test-api.c b/third-party/mimalloc/test/test-api.c index 76101980be..154845447f 100644 --- a/third-party/mimalloc/test/test-api.c +++ b/third-party/mimalloc/test/test-api.c @@ -65,6 +65,15 @@ bool mem_is_zero(uint8_t* p, size_t size) { int main(void) { mi_option_disable(mi_option_verbose); + CHECK_BODY("malloc-aligned9a") { // test large alignments + void* p = mi_zalloc_aligned(1024 * 1024, 2); + mi_free(p); + p = mi_zalloc_aligned(1024 * 1024, 2); + mi_free(p); + result = true; + }; + + // --------------------------------------------------- // Malloc // --------------------------------------------------- @@ -157,6 +166,7 @@ int main(void) { printf("malloc_aligned5: usable size: %zi\n", usable); mi_free(p); }; + /* CHECK_BODY("malloc-aligned6") { bool ok = true; for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) { @@ -174,6 +184,7 @@ int main(void) { } result = ok; }; + */ CHECK_BODY("malloc-aligned7") { void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX); mi_free(p); @@ -189,7 +200,7 @@ int main(void) { } result = ok; }; - 
CHECK_BODY("malloc-aligned9") { + CHECK_BODY("malloc-aligned9") { // test large alignments bool ok = true; void* p[8]; size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; diff --git a/third-party/mimalloc/test/test-stress.c b/third-party/mimalloc/test/test-stress.c index 15d0e25bf0..c0d9761ac5 100644 --- a/third-party/mimalloc/test/test-stress.c +++ b/third-party/mimalloc/test/test-stress.c @@ -22,29 +22,39 @@ terms of the MIT license. #include #include +// #define MI_GUARDED +// #define USE_STD_MALLOC + // > mimalloc-test-stress [THREADS] [SCALE] [ITER] // // argument defaults -static int THREADS = 32; // more repeatable if THREADS <= #processors -static int SCALE = 25; // scaling factor - -#if defined(MI_TSAN) -static int ITER = 10; // N full iterations destructing and re-creating all threads (on tsan reduce for azure pipeline limits) +#if defined(MI_TSAN) // with thread-sanitizer reduce the threads to test within the azure pipeline limits +static int THREADS = 8; +static int SCALE = 25; +static int ITER = 400; +#elif defined(MI_UBSAN) // with undefined behavious sanitizer reduce parameters to stay within the azure pipeline limits +static int THREADS = 8; +static int SCALE = 25; +static int ITER = 20; +#elif defined(MI_GUARDED) // with debug guard pages reduce parameters to stay within the azure pipeline limits +static int THREADS = 8; +static int SCALE = 10; +static int ITER = 10; #else +static int THREADS = 32; // more repeatable if THREADS <= #processors +static int SCALE = 50; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif -// static int THREADS = 8; // more repeatable if THREADS <= #processors -// static int SCALE = 100; // scaling factor + #define STRESS // undefine for leak test -static bool allow_large_objects = true; // allow very large objects? 
(set to `true` if SCALE>100) +static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? static bool main_participates = false; // main thread participates as a worker too -// #define USE_STD_MALLOC #ifdef USE_STD_MALLOC #define custom_calloc(n,s) calloc(n,s) #define custom_realloc(p,s) realloc(p,s) @@ -54,6 +64,10 @@ static bool main_participates = false; // main thread participates as a #define custom_calloc(n,s) mi_calloc(n,s) #define custom_realloc(p,s) mi_realloc(p,s) #define custom_free(p) mi_free(p) + +#ifndef NDEBUG +#define HEAP_WALK // walk the heap objects? +#endif #endif // transfer pointer between threads @@ -129,6 +143,16 @@ static void free_items(void* p) { custom_free(p); } +#ifdef HEAP_WALK +static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { + (void)(heap); (void)(area); + size_t* total = (size_t*)arg; + if (block != NULL) { + *total += block_size; + } + return true; +} +#endif static void stress(intptr_t tid) { //bench_start_thread(); @@ -173,6 +197,13 @@ static void stress(intptr_t tid) { data[data_idx] = q; } } + + #ifdef HEAP_WALK + // walk the heap + size_t total = 0; + mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + #endif + // free everything that is left for (size_t i = 0; i < retain_top; i++) { free_items(retained[i]); @@ -190,7 +221,15 @@ static void run_os_threads(size_t nthreads, void (*entry)(intptr_t tid)); static void test_stress(void) { uintptr_t r = rand(); for (int n = 0; n < ITER; n++) { - run_os_threads(THREADS, &stress); + run_os_threads(THREADS, &stress); + #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) + // switch between arena and OS allocation for testing + // mi_option_set_enabled(mi_option_disallow_arena_alloc, (n%2)==1); + #endif + #ifdef HEAP_WALK + size_t total = 0; + 
mi_abandoned_visit_blocks(mi_subproc_main(), -1, true, visit_blocks, &total); + #endif for (int i = 0; i < TRANSFERS; i++) { if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers void* p = atomic_exchange_ptr(&transfer[i], NULL); @@ -199,8 +238,8 @@ static void test_stress(void) { } #ifndef NDEBUG //mi_collect(false); - //mi_debug_show_arenas(); - #endif + //mi_debug_show_arenas(true); + #endif #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif @@ -230,9 +269,15 @@ static void test_leak(void) { #endif int main(int argc, char** argv) { + #ifdef HEAP_WALK + mi_option_enable(mi_option_visit_abandoned); + #endif + #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) + mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + #endif #ifndef USE_STD_MALLOC mi_stats_reset(); - #endif + #endif // > mimalloc-test-stress [THREADS] [SCALE] [ITER] if (argc >= 2) { @@ -277,11 +322,11 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - // mi_collect(true); - mi_debug_show_arenas(true,true,true); - #endif - mi_stats_print(NULL); + mi_debug_show_arenas(true); + mi_collect(true); + #endif #endif + mi_stats_print(NULL); //bench_end_program(); return 0; } @@ -291,7 +336,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress; #ifdef _WIN32 -#include +#include static DWORD WINAPI thread_entry(LPVOID param) { thread_entry_fun((intptr_t)param); diff --git a/third-party/tbb/.bazelversion b/third-party/tbb/.bazelversion index 21c8c7b46b..b26a34e470 100644 --- a/third-party/tbb/.bazelversion +++ b/third-party/tbb/.bazelversion @@ -1 +1 @@ -7.1.1 +7.2.1 diff --git a/third-party/tbb/.github/ISSUE_TEMPLATE/1_question.md b/third-party/tbb/.github/ISSUE_TEMPLATE/1_question.md new file mode 100644 index 0000000000..bee63c246b --- /dev/null +++ b/third-party/tbb/.github/ISSUE_TEMPLATE/1_question.md @@ -0,0 +1,7 @@ +--- +name: Ask a 
question +about: Use this template for any questions +title: '' +labels: 'question' +assignees: '' +--- \ No newline at end of file diff --git a/third-party/tbb/.github/ISSUE_TEMPLATE/2_bug_report.md b/third-party/tbb/.github/ISSUE_TEMPLATE/2_bug_report.md new file mode 100644 index 0000000000..4e135815ca --- /dev/null +++ b/third-party/tbb/.github/ISSUE_TEMPLATE/2_bug_report.md @@ -0,0 +1,32 @@ +--- +name: Report a bug or a performance issue +about: Use this template to report unexpected behavior +title: '' +labels: 'bug' +assignees: '' +--- + +# Summary +Provide a short summary of the issue. +See the sections below +for factors important for the reproduction of an issue. + +# Version +Report oneTBB version used to reproduce the problem. + +# Environment +Provide any environmental details that you consider significant for reproducing the issue. +The following information is important: +* Hardware +* OS name and version +* Compiler version + +# Observed Behavior +Document behavior you observe. + +# Expected Behavior +Document behavior you expect. + +# Steps To Reproduce +Check that the issue is reproducible with the latest revision +of the master branch. Include all the steps to reproduce the issue. \ No newline at end of file diff --git a/third-party/tbb/.github/ISSUE_TEMPLATE/3_feature_request.md b/third-party/tbb/.github/ISSUE_TEMPLATE/3_feature_request.md new file mode 100644 index 0000000000..c4f8cfcbb3 --- /dev/null +++ b/third-party/tbb/.github/ISSUE_TEMPLATE/3_feature_request.md @@ -0,0 +1,19 @@ +--- +name: Request a feature +about: Use this template to request new functionality or change the behavior of the library +title: '' +labels: 'new feature' +assignees: '' +--- + +# Summary +Include a short summary of the request. + +See the sections below +for factors important for a feature request. + +# Problem Statement +Describe the problem you want to solve with a reasonable level of detail. 
+ +# Preferred Solution +Provide your ideas regarding problem solutions. \ No newline at end of file diff --git a/third-party/tbb/.github/ISSUE_TEMPLATE/4_documentation.md b/third-party/tbb/.github/ISSUE_TEMPLATE/4_documentation.md new file mode 100644 index 0000000000..3788d13b89 --- /dev/null +++ b/third-party/tbb/.github/ISSUE_TEMPLATE/4_documentation.md @@ -0,0 +1,20 @@ +--- +name: Request a documentation change +about: Use this template to report documentation issue or request documentation changes +title: '' +labels: 'documentation' +assignees: '' +--- + +# Summary +Include a short summary of the issue or request. +See the sections below +for factors important for a documentation +issue. + +# URLs +Include pointers to documents that are impacted. + +# Additional Details +Provide a detailed description of the expected changes in documentation +and suggestions you have. \ No newline at end of file diff --git a/third-party/tbb/.github/pull_request_template.md b/third-party/tbb/.github/pull_request_template.md index caf80fff86..f986d31a40 100644 --- a/third-party/tbb/.github/pull_request_template.md +++ b/third-party/tbb/.github/pull_request_template.md @@ -4,8 +4,6 @@ _Add a comprehensive description of proposed changes_ Fixes # - _issue number(s) if exists_ -- [ ] - git commit message contains an appropriate signed-off-by string _(see [CONTRIBUTING.md](https://github.com/oneapi-src/oneTBB/blob/master/CONTRIBUTING.md#pull-requests) for details)_ - ### Type of change _Choose one or multiple, leave empty if none of the other choices apply_ diff --git a/third-party/tbb/.github/workflows/ci.yml b/third-party/tbb/.github/workflows/ci.yml index a65de62241..7dbf3c407d 100644 --- a/third-party/tbb/.github/workflows/ci.yml +++ b/third-party/tbb/.github/workflows/ci.yml @@ -37,7 +37,7 @@ jobs: runs-on: [ubuntu-20.04] timeout-minutes: 10 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run scan run: | sudo apt update && sudo apt install -y 
codespell @@ -47,7 +47,7 @@ jobs: runs-on: [ubuntu-20.04] timeout-minutes: 10 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run scan run: | command -v clang-format-10 @@ -62,7 +62,7 @@ jobs: runs-on: [ubuntu-22.04] timeout-minutes: 10 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Install prerequisites run: | pip3 install -U Jinja2 @@ -90,7 +90,7 @@ jobs: needs: [documentation] steps: - name: Checkout gh-pages - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: gh-pages path: gh-pages @@ -117,7 +117,7 @@ jobs: if: ${{ github.ref != 'refs/heads/master' }} runs-on: [ubuntu-20.04] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Run check @@ -137,7 +137,7 @@ jobs: runs-on: [ubuntu-latest] timeout-minutes: 15 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing run: | mkdir build && cd build @@ -179,7 +179,7 @@ jobs: preview: 'ON' cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -212,7 +212,7 @@ jobs: preview: 'ON' cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -257,7 +257,7 @@ jobs: preview: 'OFF' job_name: windows_cl2022_cxx17_relwithdebinfo_preview=OFF steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing run: | mkdir build @@ -295,7 +295,7 @@ jobs: build_type: debug preview: 'ON' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -321,7 +321,7 @@ jobs: build_type: relwithdebinfo preview: 'ON' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -357,7 +357,7 @@ jobs: preview: 'OFF' job_name: examples_windows_cl2022_cxx17_relwithdebinfo_preview=OFF steps: - - uses: 
actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing run: | mkdir build diff --git a/third-party/tbb/.github/workflows/codeql.yml b/third-party/tbb/.github/workflows/codeql.yml new file mode 100644 index 0000000000..7a80c5f0e2 --- /dev/null +++ b/third-party/tbb/.github/workflows/codeql.yml @@ -0,0 +1,86 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "CodeQL" + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + schedule: + - cron: '0 0 * * 1' + +permissions: + contents: read + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + runs-on: ubuntu-latest + # timeout-minutes: + permissions: + # required for all workflows + security-events: write + # required to fetch internal or private CodeQL packs + packages: read + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + language: ["cpp", "python"] + + steps: + - name: Harden Runner + uses: step-security/harden-runner@v2.6.1 + with: + egress-policy: audit + + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
+ # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v3.24.10 + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # Command-line programs to run using the OS shell. + # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + #- if: matrix.build-mode == 'manual' + # shell: bash + # run: | + # echo 'If you are using a "manual" build mode for one or more of the' \ + # 'languages you are analyzing, replace this with the commands to build' \ + # 'your code, for example:' + # echo ' make bootstrap' + # echo ' make release' + # exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/third-party/tbb/.github/workflows/ossf-scorecard.yml b/third-party/tbb/.github/workflows/ossf-scorecard.yml new file mode 100644 index 0000000000..9f45569f8a --- /dev/null +++ b/third-party/tbb/.github/workflows/ossf-scorecard.yml @@ -0,0 +1,83 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: OSSF Scorecard +on: + # For Branch-Protection check. Only the default branch is supported. 
See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '00 02 * * *' + push: + branches: [ "master" ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@v4.1.1 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@v2.3.1 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. 
+ #- name: "Upload artifact" + # uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20 + # with: + # name: SARIF file + # path: results.sarif + # retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard (optional). + # Commenting out will disable upload of results to your repo's Code Scanning dashboard + #- name: "Upload to code-scanning" + # uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9 + # with: + # sarif_file: results.sarif diff --git a/third-party/tbb/BUILD.bazel b/third-party/tbb/BUILD.bazel index 34f98eba10..9073f4640d 100644 --- a/third-party/tbb/BUILD.bazel +++ b/third-party/tbb/BUILD.bazel @@ -117,6 +117,47 @@ cc_library( ], ) +cc_test( + name = "test_mutex", + srcs = [ + "test/tbb/test_mutex.cpp", + "test/tbb/test_mutex.h" + ] + glob([ + "test/common/*.h", + ]), + includes = ["test"], + deps = [ + ":tbb", + ], +) + +cc_test( + name = "test_parallel_for", + srcs = [ + "test/tbb/test_parallel_for.cpp", + "test/tbb/test_partitioner.h" + ] + glob([ + "test/common/*.h", + ]), + includes = ["test"], + deps = [ + ":tbb", + ], +) + +cc_test( + name = "test_parallel_reduce", + srcs = [ + "test/tbb/test_parallel_reduce.cpp", + ] + glob([ + "test/common/*.h", + ]), + includes = ["test"], + deps = [ + ":tbb", + ], +) + cc_test( name = "test_task", srcs = [ diff --git a/third-party/tbb/Bazel.md b/third-party/tbb/Bazel.md index 996a3b2eb5..09a630a72b 100644 --- a/third-party/tbb/Bazel.md +++ b/third-party/tbb/Bazel.md @@ -19,6 +19,8 @@ The standard Bazel approach to handling third-party libraries is static linking. ## Using oneTBB as a dependency +### Traditional WORKSPACE approach + This example demonstrates how to use oneTBB as a dependency within a Bazel project. The following file structure is assumed: @@ -78,6 +80,16 @@ The expected output of this program is the current version of oneTBB. 
Switch to the folder with the files created earlier and run the binary with `bazel run //:Demo`. +### Bzlmod + +If you use Bzlmod, you can fetch oneTBB with the [Bazel Central Registry](https://registry.bazel.build/). + +Add the following line to your `MODULE.bazel` file: + +```bazel +bazel_dep(name = "onetbb", version = "2021.13.0") +``` + ## Build oneTBB using Bazel Run ```bazel build //...``` in the oneTBB root directory. diff --git a/third-party/tbb/CMakeLists.txt b/third-party/tbb/CMakeLists.txt index 19232a9920..811a3a5549 100644 --- a/third-party/tbb/CMakeLists.txt +++ b/third-party/tbb/CMakeLists.txt @@ -49,7 +49,7 @@ string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" _tbb_ver_min string(REGEX REPLACE ".*#define TBB_VERSION_PATCH ([0-9]+).*" "\\1" _tbb_ver_patch "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_tbb_version_info}") string(REGEX REPLACE ".*#define __TBB_BINARY_VERSION ([0-9]+).*" "\\1" TBB_BINARY_VERSION "${_tbb_version_info}") -set(TBB_BINARY_MINOR_VERSION ${_tbb_ver_minor}) +string(REGEX REPLACE "..(..)." "\\1" TBB_BINARY_MINOR_VERSION "${TBB_INTERFACE_VERSION}") set(TBBMALLOC_BINARY_VERSION 2) set(TBBBIND_BINARY_VERSION 3) @@ -107,6 +107,9 @@ option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg option(TBB_ENABLE_IPO "Enable Interprocedural Optimization (IPO) during the compilation" ON) option(TBB_FUZZ_TESTING "Enable fuzz testing" OFF) option(TBB_INSTALL "Enable installation" ON) +if(LINUX) +option(TBB_LINUX_SEPARATE_DBG "Enable separation of the debug symbols during the build" OFF) +endif() if(APPLE) option(TBB_BUILD_APPLE_FRAMEWORKS "Build as Apple Frameworks" OFF) endif() diff --git a/third-party/tbb/CODEOWNERS b/third-party/tbb/CODEOWNERS new file mode 100644 index 0000000000..78105ac7e8 --- /dev/null +++ b/third-party/tbb/CODEOWNERS @@ -0,0 +1,27 @@ +# Where component owners are known, add them here. 
+ +/oneTBB/src/tbb/ @pavelkumbrasev +/oneTBB/src/tbb/ @dnmokhov +/oneTBB/src/tbb/ @JhaShweta1 +/oneTBB/src/tbb/ @sarathnandu +/oneTBB/include/oneapi/tbb/parallel_* @pavelkumbrasev +/oneTBB/include/oneapi/tbb/concurrent_* @kboyarinov +/oneTBB/include/oneapi/tbb/flow_graph* @kboyarinov +/oneTBB/include/oneapi/tbb/flow_graph* @aleksei-fedotov +/oneTBB/include/oneapi/tbb/detail/_flow_graph* @kboyarinov +/oneTBB/include/oneapi/tbb/detail/_flow_graph* @aleksei-fedotov +/oneTBB/include/oneapi/tbb/detail/_concurrent* @kboyarinov +/oneTBB/src/doc @aepanchi +/oneTBB/src/tbbbind/ @isaevil +/oneTBB/src/tbbmalloc/ @lplewa +/oneTBB/src/tbbmalloc_proxy/ @lplewa +/oneTBB/cmake/ @isaevil +/oneTBB/*CMakeLists.txt @isaevil +/oneTBB/python/ @sarathnandu +/oneTBB/python/ @isaevil + +# Bazel build related files. +/oneTBB/.bazelversion @Vertexwahn +/oneTBB/Bazel.md @Vertexwahn +/oneTBB/BUILD.bazel @Vertexwahn +/oneTBB/MODULE.bazel @Vertexwahn diff --git a/third-party/tbb/CODE_OF_CONDUCT.md b/third-party/tbb/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..c169707396 --- /dev/null +++ b/third-party/tbb/CODE_OF_CONDUCT.md @@ -0,0 +1,134 @@ + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +oneTBBCodeOfConduct At intel DOT com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations + diff --git a/third-party/tbb/CONTRIBUTING.md b/third-party/tbb/CONTRIBUTING.md index 3048b21199..b2b6a968cd 100644 --- a/third-party/tbb/CONTRIBUTING.md +++ b/third-party/tbb/CONTRIBUTING.md @@ -21,9 +21,7 @@ As an open source project, we welcome community contributions to oneAPI Threadin Licensing is very important to open source projects. It helps ensure the software continues to be available under the terms that the author desired. The oneTBB project uses the [Apache 2.0 License](https://github.com/oneapi-src/oneTBB/blob/master/LICENSE.txt), a permissive open source license that allows you to freely use, modify, and distribute your own products that include Apache 2.0 licensed software. 
By contributing to the oneTBB project, you agree to the license and copyright terms therein and release your own contributions under these terms. -Some imported or reused components within oneTBB use other licenses, as described in [third-party-programs.txt](https://github.com/oneapi-src/oneTBB/blob/master/third-party-programs.txt). By carefully reviewing potential contributions and enforcing a [Developer Certification of Origin (DCO)](https://developercertificate.org/) for contributed code, we can ensure that the community can develop products with oneTBB without concerns over patent or copyright issues. - -The DCO is an attestation attached to every contribution made by every developer. In the commit message of the contribution, (described later), the developer simply adds a Signed-off-by statement and thereby agrees to the DCO. +Some imported or reused components within oneTBB use other licenses, as described in [third-party-programs.txt](https://github.com/oneapi-src/oneTBB/blob/master/third-party-programs.txt). By carefully reviewing potential contributions, we can ensure that the community can develop products with oneTBB without concerns over patent or copyright issues. ## Prerequisites @@ -32,12 +30,6 @@ As a contributor, you’ll want to be familiar with the oneTBB project and the r ## Pull Requests You can find all [open oneTBB pull requests](https://github.com/oneapi-src/oneTBB/pulls) on GitHub. - -No anonymous contributions are accepted. The name in the commit message Signed-off-by line and your email must match the change authorship information. 
Make sure your .gitconfig is set up correctly so you can use `git commit -s` for signing your patches: - -`git config --global user.name "Taylor Developer"` - -`git config --global user.email taylor.developer@company.com` ### Before contributing changes directly to the oneTBB repository diff --git a/third-party/tbb/MODULE.bazel b/third-party/tbb/MODULE.bazel index cc6698f0de..063bc2f468 100644 --- a/third-party/tbb/MODULE.bazel +++ b/third-party/tbb/MODULE.bazel @@ -21,4 +21,4 @@ module( compatibility_level = 1, ) -bazel_dep(name = "platforms", version = "0.0.9") +bazel_dep(name = "platforms", version = "0.0.10") diff --git a/third-party/tbb/MODULE.bazel.lock b/third-party/tbb/MODULE.bazel.lock new file mode 100644 index 0000000000..06f9098032 --- /dev/null +++ b/third-party/tbb/MODULE.bazel.lock @@ -0,0 +1,65 @@ +{ + "lockFileVersion": 11, + "registryFileHashes": { + "https://bcr.bazel.build/bazel_registry.json": "8a28e4aff06ee60aed2a8c281907fb8bcbf3b753c91fb5a5c57da3215d5b3497", + "https://bcr.bazel.build/modules/abseil-cpp/20210324.2/MODULE.bazel": "7cd0312e064fde87c8d1cd79ba06c876bd23630c83466e9500321be55c96ace2", + "https://bcr.bazel.build/modules/abseil-cpp/20211102.0/MODULE.bazel": "70390338f7a5106231d20620712f7cccb659cd0e9d073d1991c038eb9fc57589", + "https://bcr.bazel.build/modules/abseil-cpp/20211102.0/source.json": "7e3a9adf473e9af076ae485ed649d5641ad50ec5c11718103f34de03170d94ad", + "https://bcr.bazel.build/modules/apple_support/1.5.0/MODULE.bazel": "50341a62efbc483e8a2a6aec30994a58749bd7b885e18dd96aa8c33031e558ef", + "https://bcr.bazel.build/modules/apple_support/1.5.0/source.json": "eb98a7627c0bc486b57f598ad8da50f6625d974c8f723e9ea71bd39f709c9862", + "https://bcr.bazel.build/modules/bazel_features/1.11.0/MODULE.bazel": "f9382337dd5a474c3b7d334c2f83e50b6eaedc284253334cf823044a26de03e8", + "https://bcr.bazel.build/modules/bazel_features/1.11.0/source.json": "c9320aa53cd1c441d24bd6b716da087ad7e4ff0d9742a9884587596edfe53015", + 
"https://bcr.bazel.build/modules/bazel_skylib/1.0.3/MODULE.bazel": "bcb0fd896384802d1ad283b4e4eb4d718eebd8cb820b0a2c3a347fb971afd9d8", + "https://bcr.bazel.build/modules/bazel_skylib/1.2.1/MODULE.bazel": "f35baf9da0efe45fa3da1696ae906eea3d615ad41e2e3def4aeb4e8bc0ef9a7a", + "https://bcr.bazel.build/modules/bazel_skylib/1.3.0/MODULE.bazel": "20228b92868bf5cfc41bda7afc8a8ba2a543201851de39d990ec957b513579c5", + "https://bcr.bazel.build/modules/bazel_skylib/1.6.1/MODULE.bazel": "8fdee2dbaace6c252131c00e1de4b165dc65af02ea278476187765e1a617b917", + "https://bcr.bazel.build/modules/bazel_skylib/1.6.1/source.json": "082ed5f9837901fada8c68c2f3ddc958bb22b6d654f71dd73f3df30d45d4b749", + "https://bcr.bazel.build/modules/buildozer/7.1.2/MODULE.bazel": "2e8dd40ede9c454042645fd8d8d0cd1527966aa5c919de86661e62953cd73d84", + "https://bcr.bazel.build/modules/buildozer/7.1.2/source.json": "c9028a501d2db85793a6996205c8de120944f50a0d570438fcae0457a5f9d1f8", + "https://bcr.bazel.build/modules/googletest/1.11.0/MODULE.bazel": "3a83f095183f66345ca86aa13c58b59f9f94a2f81999c093d4eeaa2d262d12f4", + "https://bcr.bazel.build/modules/googletest/1.11.0/source.json": "c73d9ef4268c91bd0c1cd88f1f9dfa08e814b1dbe89b5f594a9f08ba0244d206", + "https://bcr.bazel.build/modules/platforms/0.0.10/MODULE.bazel": "8cb8efaf200bdeb2150d93e162c40f388529a25852b332cec879373771e48ed5", + "https://bcr.bazel.build/modules/platforms/0.0.10/source.json": "f22828ff4cf021a6b577f1bf6341cb9dcd7965092a439f64fc1bb3b7a5ae4bd5", + "https://bcr.bazel.build/modules/platforms/0.0.4/MODULE.bazel": "9b328e31ee156f53f3c416a64f8491f7eb731742655a47c9eec4703a71644aee", + "https://bcr.bazel.build/modules/platforms/0.0.5/MODULE.bazel": "5733b54ea419d5eaf7997054bb55f6a1d0b5ff8aedf0176fef9eea44f3acda37", + "https://bcr.bazel.build/modules/platforms/0.0.6/MODULE.bazel": "ad6eeef431dc52aefd2d77ed20a4b353f8ebf0f4ecdd26a807d2da5aa8cd0615", + "https://bcr.bazel.build/modules/platforms/0.0.7/MODULE.bazel": 
"72fd4a0ede9ee5c021f6a8dd92b503e089f46c227ba2813ff183b71616034814", + "https://bcr.bazel.build/modules/platforms/0.0.9/MODULE.bazel": "4a87a60c927b56ddd67db50c89acaa62f4ce2a1d2149ccb63ffd871d5ce29ebc", + "https://bcr.bazel.build/modules/protobuf/21.7/MODULE.bazel": "a5a29bb89544f9b97edce05642fac225a808b5b7be74038ea3640fae2f8e66a7", + "https://bcr.bazel.build/modules/protobuf/21.7/source.json": "bbe500720421e582ff2d18b0802464205138c06056f443184de39fbb8187b09b", + "https://bcr.bazel.build/modules/protobuf/3.19.0/MODULE.bazel": "6b5fbb433f760a99a22b18b6850ed5784ef0e9928a72668b66e4d7ccd47db9b0", + "https://bcr.bazel.build/modules/protobuf/3.19.6/MODULE.bazel": "9233edc5e1f2ee276a60de3eaa47ac4132302ef9643238f23128fea53ea12858", + "https://bcr.bazel.build/modules/rules_cc/0.0.1/MODULE.bazel": "cb2aa0747f84c6c3a78dad4e2049c154f08ab9d166b1273835a8174940365647", + "https://bcr.bazel.build/modules/rules_cc/0.0.2/MODULE.bazel": "6915987c90970493ab97393024c156ea8fb9f3bea953b2f3ec05c34f19b5695c", + "https://bcr.bazel.build/modules/rules_cc/0.0.8/MODULE.bazel": "964c85c82cfeb6f3855e6a07054fdb159aced38e99a5eecf7bce9d53990afa3e", + "https://bcr.bazel.build/modules/rules_cc/0.0.9/MODULE.bazel": "836e76439f354b89afe6a911a7adf59a6b2518fafb174483ad78a2a2fde7b1c5", + "https://bcr.bazel.build/modules/rules_cc/0.0.9/source.json": "1f1ba6fea244b616de4a554a0f4983c91a9301640c8fe0dd1d410254115c8430", + "https://bcr.bazel.build/modules/rules_java/4.0.0/MODULE.bazel": "5a78a7ae82cd1a33cef56dc578c7d2a46ed0dca12643ee45edbb8417899e6f74", + "https://bcr.bazel.build/modules/rules_java/7.6.1/MODULE.bazel": "2f14b7e8a1aa2f67ae92bc69d1ec0fa8d9f827c4e17ff5e5f02e91caa3b2d0fe", + "https://bcr.bazel.build/modules/rules_java/7.6.1/source.json": "8f3f3076554e1558e8e468b2232991c510ecbcbed9e6f8c06ac31c93bcf38362", + "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel": "a56b85e418c83eb1839819f0b515c431010160383306d13ec21959ac412d2fe7", + 
"https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/source.json": "a075731e1b46bc8425098512d038d416e966ab19684a10a34f4741295642fc35", + "https://bcr.bazel.build/modules/rules_license/0.0.3/MODULE.bazel": "627e9ab0247f7d1e05736b59dbb1b6871373de5ad31c3011880b4133cafd4bd0", + "https://bcr.bazel.build/modules/rules_license/0.0.7/MODULE.bazel": "088fbeb0b6a419005b89cf93fe62d9517c0a2b8bb56af3244af65ecfe37e7d5d", + "https://bcr.bazel.build/modules/rules_license/0.0.7/source.json": "355cc5737a0f294e560d52b1b7a6492d4fff2caf0bef1a315df5a298fca2d34a", + "https://bcr.bazel.build/modules/rules_pkg/0.7.0/MODULE.bazel": "df99f03fc7934a4737122518bb87e667e62d780b610910f0447665a7e2be62dc", + "https://bcr.bazel.build/modules/rules_pkg/0.7.0/source.json": "c2557066e0c0342223ba592510ad3d812d4963b9024831f7f66fd0584dd8c66c", + "https://bcr.bazel.build/modules/rules_proto/4.0.0/MODULE.bazel": "a7a7b6ce9bee418c1a760b3d84f83a299ad6952f9903c67f19e4edd964894e06", + "https://bcr.bazel.build/modules/rules_proto/5.3.0-21.7/MODULE.bazel": "e8dff86b0971688790ae75528fe1813f71809b5afd57facb44dad9e8eca631b7", + "https://bcr.bazel.build/modules/rules_proto/5.3.0-21.7/source.json": "d57902c052424dfda0e71646cb12668d39c4620ee0544294d9d941e7d12bc3a9", + "https://bcr.bazel.build/modules/rules_python/0.10.2/MODULE.bazel": "cc82bc96f2997baa545ab3ce73f196d040ffb8756fd2d66125a530031cd90e5f", + "https://bcr.bazel.build/modules/rules_python/0.22.1/MODULE.bazel": "26114f0c0b5e93018c0c066d6673f1a2c3737c7e90af95eff30cfee38d0bbac7", + "https://bcr.bazel.build/modules/rules_python/0.22.1/source.json": "57226905e783bae7c37c2dd662be078728e48fa28ee4324a7eabcafb5a43d014", + "https://bcr.bazel.build/modules/rules_python/0.4.0/MODULE.bazel": "9208ee05fd48bf09ac60ed269791cf17fb343db56c8226a720fbb1cdf467166c", + "https://bcr.bazel.build/modules/stardoc/0.5.1/MODULE.bazel": "1a05d92974d0c122f5ccf09291442580317cdd859f07a8655f1db9a60374f9f8", + "https://bcr.bazel.build/modules/stardoc/0.5.1/source.json": 
"a96f95e02123320aa015b956f29c00cb818fa891ef823d55148e1a362caacf29", + "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/MODULE.bazel": "7298990c00040a0e2f121f6c32544bab27d4452f80d9ce51349b1a28f3005c43", + "https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/source.json": "f1ef7d3f9e0e26d4b23d1c39b5f5de71f584dd7d1b4ef83d9bbba6ec7a6a6459", + "https://bcr.bazel.build/modules/zlib/1.2.11/MODULE.bazel": "07b389abc85fdbca459b69e2ec656ae5622873af3f845e1c9d80fe179f3effa0", + "https://bcr.bazel.build/modules/zlib/1.2.12/MODULE.bazel": "3b1a8834ada2a883674be8cbd36ede1b6ec481477ada359cd2d3ddc562340b27", + "https://bcr.bazel.build/modules/zlib/1.3/MODULE.bazel": "6a9c02f19a24dcedb05572b2381446e27c272cd383aed11d41d99da9e3167a72", + "https://bcr.bazel.build/modules/zlib/1.3/source.json": "b6b43d0737af846022636e6e255fd4a96fee0d34f08f3830e6e0bac51465c37c" + }, + "selectedYankedVersions": {}, + "moduleExtensions": {} +} diff --git a/third-party/tbb/README.md b/third-party/tbb/README.md index f2bc0a0afa..2e7c2e81ba 100644 --- a/third-party/tbb/README.md +++ b/third-party/tbb/README.md @@ -1,5 +1,8 @@ -# oneAPI Threading Building Blocks +# oneAPI Threading Building Blocks (oneTBB) [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE.txt) [![oneTBB CI](https://github.com/oneapi-src/oneTBB/actions/workflows/ci.yml/badge.svg)](https://github.com/oneapi-src/oneTBB/actions/workflows/ci.yml?query=branch%3Amaster) +[![Join the community on GitHub Discussions](https://badgen.net/badge/join%20the%20discussion/on%20github/blue?icon=github)](https://github.com/oneapi-src/oneTBB/discussions) +[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9125/badge)](https://www.bestpractices.dev/projects/9125) +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/oneTBB/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/oneTBB) oneTBB is a flexible C++ library that 
simplifies the work of adding parallelism to complex applications, even if you are not a threading expert. @@ -18,7 +21,7 @@ The library differs from typical threading packages in the following ways: Refer to oneTBB [examples](examples) and [samples](https://github.com/oneapi-src/oneAPI-samples/tree/master/Libraries/oneTBB) to see how you can use the library. -oneTBB is a part of [oneAPI](https://oneapi.io). The current branch implements version 1.1 of oneAPI Specification. +oneTBB is a part of the [UXL Foundation](http://www.uxlfoundation.org) and is an implementation of [oneAPI specification](https://oneapi.io). > **_NOTE:_** Threading Building Blocks (TBB) is now called oneAPI Threading Building Blocks (oneTBB) to highlight that the tool is a part of the oneAPI ecosystem. @@ -39,6 +42,14 @@ See [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREME ## Installation See [Installation from Sources](INSTALL.md) to learn how to install oneTBB. +## Governance + +The oneTBB project is governed by the UXL Foundation. +You can get involved in this project in following ways: +* Join the [Open Source and Specification Working Group](https://github.com/uxlfoundation/foundation/tree/main?tab=readme-ov-file#working-groups) meetings. +* Join the mailing lists for the [UXL Foundation](https://lists.uxlfoundation.org/g/main/subgroups) to receive meetings schedule and latest updates. +* Contribute to oneTBB project or oneTBB specification. Read [CONTRIBUTING](./CONTRIBUTING.md) for more information. + ## Support See our [documentation](./SUPPORT.md) to learn how to request help. @@ -46,6 +57,8 @@ See our [documentation](./SUPPORT.md) to learn how to request help. We welcome community contributions, so check our [Contributing Guidelines](CONTRIBUTING.md) to learn more. +Use GitHub Issues for feature requests, bug reports, and minor inquiries. For broader questions and development-related discussions, use GitHub Discussions. 
+ ## License oneAPI Threading Building Blocks is licensed under [Apache License, Version 2.0](LICENSE.txt). By its terms, contributions submitted to the project are also done under that license. diff --git a/third-party/tbb/WASM_Support.md b/third-party/tbb/WASM_Support.md index 8c2f6c1af9..6306620d7c 100644 --- a/third-party/tbb/WASM_Support.md +++ b/third-party/tbb/WASM_Support.md @@ -16,7 +16,7 @@ # WASM Support -oneTBB extends its capabilities by offering robust support for ``WASM``. +oneTBB extends its capabilities by offering robust support for ``WASM`` (see ``Limitation`` sections). ``WASM`` stands for WebAssembly, a low-level binary format for executing code in web browsers. It is designed to be a portable target for compilers and efficient to parse and execute. @@ -58,3 +58,24 @@ To run tests, use: ctest ``` +# Limitations + +You can successfully build your application with oneTBB using WASM, but you may not achieve optimal performance immediately. This is due to the limitation with nested Web Workers: a Web Worker cannot schedule another worker without help from a browser thread. This can lead to unexpected performance outcomes, such as the application running in serial. +Find more information in the [issue](https://github.com/emscripten-core/emscripten/discussions/21963) in the Emscripten repository. +To workaround this issue, try one of the following ways: +1. **Recommended Solution: Use the ``-sPROXY_TO_PTHREAD`` Flag**. +This flag splits the initial thread into a browser thread and a main thread (proxied by a Web Worker), effectively resolving the issue as the browser thread is always present in the event loop and can participate in Web Workers scheduling. Refer to the [Emscripten documentation](https://emscripten.org/docs/porting/pthreads.html) for more details about ``-sPROXY_TO_PTHREAD`` since using this flag may require refactoring the code. +2. 
**Alternative Solution: Warm Up the oneTBB Thread Pool** +Initialize the oneTBB thread pool before making the first call to oneTBB. This approach forces the browser thread to participate in Web Workers scheduling. +```cpp + int num_threads = tbb::this_task_arena::max_concurrency(); + std::atomic barrier{num_threads}; + tbb::parallel_for(0, num_threads, [&barrier] (int) { + barrier--; + while (barrier > 0) { + // Send browser thread to event loop + std::this_thread::yield(); + } + }, tbb::static_partitioner{}); +``` +> **_NOTE:_** Be aware that it might cause delays on the browser side. diff --git a/third-party/tbb/cmake/README.md b/third-party/tbb/cmake/README.md index aa811b0fc0..3a357218d5 100644 --- a/third-party/tbb/cmake/README.md +++ b/third-party/tbb/cmake/README.md @@ -10,6 +10,7 @@ TBB_SANITIZE:STRING - Sanitizer parameter, passed to compiler/linker TBB_SIGNTOOL:FILEPATH - Tool for digital signing, used in post-install step for libraries if provided. TBB_SIGNTOOL_ARGS:STRING - Additional arguments for TBB_SIGNTOOL, used if TBB_SIGNTOOL is set. TBB_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) build (ON by default) +TBB_FIND_PACKAGE - Enable search for external oneTBB using find_package instead of build from sources (OFF by default) TBBMALLOC_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator build (ON by default) TBBMALLOC_PROXY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator proxy build (requires TBBMALLOC_BUILD. ON by default) TBB4PY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) Python module build (OFF by default) @@ -187,7 +188,14 @@ cmake --build . --target test # currently does not work on Windows* OS ``` ## Installation -See [Installation from Sources](../INSTALL.md) to learn how to install oneTBB. +See [Installation from Sources](../INSTALL.md) to learn how to install oneTBB. 
+ +To install oneTBB from the release packages, use the following commands: +```bash +tar -xvf oneapi-tbb-xxx.xx.x-*.tgz +source env/vars.sh +``` + ## Sanitizers - Configure, Build, and Run diff --git a/third-party/tbb/cmake/compilers/Clang.cmake b/third-party/tbb/cmake/compilers/Clang.cmake index f56b5fba0f..dcd66634f3 100644 --- a/third-party/tbb/cmake/compilers/Clang.cmake +++ b/third-party/tbb/cmake/compilers/Clang.cmake @@ -13,12 +13,16 @@ # limitations under the License. if (EMSCRIPTEN) - set(TBB_EMSCRIPTEN 1) - set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fexceptions) - set(TBB_TEST_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} -fexceptions -sINITIAL_MEMORY=65536000 -sALLOW_MEMORY_GROWTH=1 -sEXIT_RUNTIME=1) - if (NOT EMSCRIPTEN_WITHOUT_PTHREAD) - set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread") - endif() + set(TBB_EMSCRIPTEN 1) + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fexceptions) + set(TBB_TEST_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} -fexceptions -sINITIAL_MEMORY=65536000 -sALLOW_MEMORY_GROWTH=1 -sMALLOC=mimalloc -sEXIT_RUNTIME=1) + if (NOT EMSCRIPTEN_WITHOUT_PTHREAD) + set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread") + endif() + set(TBB_EMSCRIPTEN_STACK_SIZE 65536) + set(TBB_LIB_COMPILE_FLAGS -D__TBB_EMSCRIPTEN_STACK_SIZE=${TBB_EMSCRIPTEN_STACK_SIZE}) + set(TBB_TEST_LINK_FLAGS ${TBB_TEST_LINK_FLAGS} -sTOTAL_STACK=${TBB_EMSCRIPTEN_STACK_SIZE}) + unset(TBB_EMSCRIPTEN_STACK_SIZE) endif() if (MINGW) diff --git a/third-party/tbb/cmake/compilers/GNU.cmake b/third-party/tbb/cmake/compilers/GNU.cmake index 6fd8d9808d..cf6d8bdbca 100644 --- a/third-party/tbb/cmake/compilers/GNU.cmake +++ b/third-party/tbb/cmake/compilers/GNU.cmake @@ -35,8 +35,39 @@ if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT CMAKE_CXX_DEPENDS_USE_COMPILER) set(TBB_MMD_FLAG -MMD) endif() + +# Binutils < 2.31.1 do not support the tpause instruction. 
When compiling with +# a modern version of GCC (supporting it) but relying on an outdated assembler, +# will result in an error reporting "no such instruction: tpause". +# The following code invokes the GNU assembler to extract the version number +# and convert it to an integer that can be used in the C++ code to compare +# against, and conditionally disable the __TBB_WAITPKG_INTRINSICS_PRESENT +# macro if the version is incompatible. Binutils only report the version in the +# MAJOR.MINOR format, therefore the version checked is >=2.32 (instead of +# >=2.31.1). Capturing the output in CMake can be done like below. The version +# information is written to either stdout or stderr. To not make any +# assumptions, both are captured. +execute_process( + COMMAND ${CMAKE_COMMAND} -E env "LANG=C" ${CMAKE_CXX_COMPILER} -xc -c /dev/null -Wa,-v -o/dev/null + OUTPUT_VARIABLE ASSEMBLER_VERSION_LINE_OUT + ERROR_VARIABLE ASSEMBLER_VERSION_LINE_ERR + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE +) +set(ASSEMBLER_VERSION_LINE ${ASSEMBLER_VERSION_LINE_OUT}${ASSEMBLER_VERSION_LINE_ERR}) +string(REGEX REPLACE ".*GNU assembler version ([0-9]+)\\.([0-9]+).*" "\\1" _tbb_gnu_asm_major_version "${ASSEMBLER_VERSION_LINE}") +string(REGEX REPLACE ".*GNU assembler version ([0-9]+)\\.([0-9]+).*" "\\2" _tbb_gnu_asm_minor_version "${ASSEMBLER_VERSION_LINE}") +unset(ASSEMBLER_VERSION_LINE_OUT) +unset(ASSEMBLER_VERSION_LINE_ERR) +unset(ASSEMBLER_VERSION_LINE) +message(TRACE "Extracted GNU assembler version: major=${_tbb_gnu_asm_major_version} minor=${_tbb_gnu_asm_minor_version}") + +math(EXPR _tbb_gnu_asm_version_number "${_tbb_gnu_asm_major_version} * 1000 + ${_tbb_gnu_asm_minor_version}") +set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} "-D__TBB_GNU_ASM_VERSION=${_tbb_gnu_asm_version_number}") +message(STATUS "GNU Assembler version: ${_tbb_gnu_asm_major_version}.${_tbb_gnu_asm_minor_version} (${_tbb_gnu_asm_version_number})") + # Enable Intel(R) Transactional 
Synchronization Extensions (-mrtm) and WAITPKG instructions support (-mwaitpkg) on relevant processors -if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)") +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)" AND NOT EMSCRIPTEN) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>,$>>:-mwaitpkg>) endif() @@ -51,6 +82,9 @@ if (NOT ${CMAKE_CXX_COMPILER_ID} STREQUAL Intel) # gcc 6.0 and later have -flifetime-dse option that controls elimination of stores done outside the object lifetime set(TBB_DSE_FLAG $<$>:-flifetime-dse=1>) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-fstack-clash-protection>) + + # Suppress GCC 12.x-13.x warning here that to_wait_node(n)->my_is_in_list might have size 0 + set(TBB_COMMON_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} $<$>,$>:-Wno-stringop-overflow>) endif() # Workaround for heavy tests and too many symbols in debug (rellocation truncated to fit: R_MIPS_CALL16) diff --git a/third-party/tbb/cmake/compilers/IntelLLVM.cmake b/third-party/tbb/cmake/compilers/IntelLLVM.cmake index a9ebb3e670..b514378164 100644 --- a/third-party/tbb/cmake/compilers/IntelLLVM.cmake +++ b/third-party/tbb/cmake/compilers/IntelLLVM.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,6 +20,9 @@ if (WIN32) else() include(${CMAKE_CURRENT_LIST_DIR}/Clang.cmake) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) + # "--exclude-libs,ALL" is used to avoid accidental exporting of symbols + # from statically linked libraries + set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel -Wl,--exclude-libs,ALL) set(TBB_OPENMP_FLAG -qopenmp) endif() set(TBB_IPO_LINK_FLAGS ${TBB_IPO_LINK_FLAGS} ${TBB_IPO_COMPILE_FLAGS}) diff --git a/third-party/tbb/doc/GSG/get_started.rst b/third-party/tbb/doc/GSG/get_started.rst index d437ce89b8..2af04be6b0 100644 --- a/third-party/tbb/doc/GSG/get_started.rst +++ b/third-party/tbb/doc/GSG/get_started.rst @@ -8,11 +8,36 @@ It is helpful for new users of parallel programming and experienced developers t It is recommended for you to have a basic knowledge of C++ programming and some experience with parallel programming concepts. +|full_name| is a runtime-based parallel programming model for C++ code that uses tasks. +The template-based runtime library can help you harness the latent performance of multi-core processors. + +oneTBB enables you to simplify parallel programming by breaking computation into parallel running tasks. Within a single process, +parallelism is carried out by mapping tasks to threads. Threads are an operating system mechanism that allows the same or different sets of instructions +to be executed simultaneously. Using threads can make your program work faster and more efficiently. + +Here you can see one of the possible executions of tasks by threads. + +.. figure:: Images/how-oneTBB-works.png + :scale: 70% + :align: center + +Use oneTBB to write scalable applications that: + +* Specify logical parallel structure instead of threads. +* Emphasize data-parallel programming. +* Take advantage of concurrent collections and parallel algorithms. + +oneTBB supports nested parallelism and load balancing. 
It means that you can use the library without worrying about oversubscribing a system, which happens when more tasks are assigned to a system than it can handle efficiently. + +oneTBB is used in different areas, such as scientific simulations, gaming, data analysis, etc. + +It is available as a stand-alone product and as part of the |base_tk|. + To start using oneTBB, follow the next steps: ********************************************* -#. Learn what :ref:`oneTBB is` and see the :ref:`System Requirements`. +#. See the :ref:`System Requirements`. #. :ref:`Install oneTBB`. #. Run your program using oneTBB following the :ref:`Next Steps `. #. Learn how to :ref:`Integrate oneTBB into your project ` using CMake* and pkg-config tool. diff --git a/third-party/tbb/doc/conf.py b/third-party/tbb/doc/conf.py index 19da0a4caf..00dfed0e7f 100644 --- a/third-party/tbb/doc/conf.py +++ b/third-party/tbb/doc/conf.py @@ -29,7 +29,7 @@ project = u'Intel® oneAPI Threading Building Blocks (oneTBB)' else: project = u'oneTBB' -copyright = u'2023, Intel Corporation' +copyright = u'Intel Corporation' author = u'Intel' # The short X.Y version @@ -140,9 +140,7 @@ } if BUILD_TYPE != 'oneapi' and BUILD_TYPE != 'dita': - html_theme_options = { - "extra_footer": "

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.

oneTBB is licensed under Apache License Version 2.0. Refer to the LICENSE file for the full license text and copyright notice.
" - } + html_theme_options["extra_footer"]="
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.

oneTBB is licensed under Apache License Version 2.0. Refer to the LICENSE file for the full license text and copyright notice.
" # Add any paths that contain custom static files (such as style sheets) here, @@ -159,11 +157,7 @@ else: html_js_files = ['custom.js'] -html_theme_options = { - "logo": { - "text": "oneTBB Documentation", - } -} +html_theme_options["logo"] = {"text": "oneTBB Documentation"} html_logo = '_static/oneAPI-rgb-rev-100.png' html_favicon = '_static/favicons.png' @@ -304,7 +298,7 @@ # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} # -- Options for todo extension ---------------------------------------------- diff --git a/third-party/tbb/doc/index/toctree.rst b/third-party/tbb/doc/index/toctree.rst index fba9aee46c..542a4bb601 100644 --- a/third-party/tbb/doc/index/toctree.rst +++ b/third-party/tbb/doc/index/toctree.rst @@ -17,7 +17,6 @@ :maxdepth: 2 /GSG/get_started - /GSG/intro /GSG/system_requirements /GSG/installation /GSG/next_steps diff --git a/third-party/tbb/doc/main/reference/custom_mutex_chmap.rst b/third-party/tbb/doc/main/reference/custom_mutex_chmap.rst index 152320fd65..acf502e66d 100644 --- a/third-party/tbb/doc/main/reference/custom_mutex_chmap.rst +++ b/third-party/tbb/doc/main/reference/custom_mutex_chmap.rst @@ -50,7 +50,7 @@ Type requirements ----------------- The type of the mutex passed as a template argument for ``concurrent_hash_map`` should -meet the requirements of `ReaderWriterMutex `_. +meet the requirements of `ReaderWriterMutex `_. It should also provide the following API: .. 
cpp:function:: bool ReaderWriterMutex::scoped_lock::is_writer() const; diff --git a/third-party/tbb/doc/main/reference/parallel_for_each_semantics.rst b/third-party/tbb/doc/main/reference/parallel_for_each_semantics.rst index 1f8815b3b3..c007066b3b 100644 --- a/third-party/tbb/doc/main/reference/parallel_for_each_semantics.rst +++ b/third-party/tbb/doc/main/reference/parallel_for_each_semantics.rst @@ -10,7 +10,7 @@ parallel_for_each Body semantics and requirements Description *********** -This page clarifies `ParallelForEachBody `_ +This page clarifies `ParallelForEachBody `_ named requirements for ``tbb::parallel_for_each`` algorithm specification. .. code:: cpp diff --git a/third-party/tbb/doc/main/reference/parallel_sort_ranges_extension.rst b/third-party/tbb/doc/main/reference/parallel_sort_ranges_extension.rst index 52f2283a17..cad65b54b0 100644 --- a/third-party/tbb/doc/main/reference/parallel_sort_ranges_extension.rst +++ b/third-party/tbb/doc/main/reference/parallel_sort_ranges_extension.rst @@ -10,7 +10,7 @@ parallel_sort ranges interface extension Description *********** -|full_name| implementation extends the `oneapi::tbb::parallel_sort specification `_ +|full_name| implementation extends the `oneapi::tbb::parallel_sort specification `_ with overloads that takes the container by forwarding reference. diff --git a/third-party/tbb/doc/main/reference/reference.rst b/third-party/tbb/doc/main/reference/reference.rst index 833a50ee70..4c293c02c7 100644 --- a/third-party/tbb/doc/main/reference/reference.rst +++ b/third-party/tbb/doc/main/reference/reference.rst @@ -3,13 +3,13 @@ |short_name| API Reference ========================== -For oneTBB API Reference, refer to `oneAPI Specification `_. The current supported +For oneTBB API Reference, refer to `oneAPI Specification `_. The current supported version of oneAPI Specification is 1.0. Specification extensions ************************ -|full_name| implements the `oneTBB specification `_. 
+|full_name| implements the `oneTBB specification `_. This document provides additional details or restrictions where necessary. It also describes features that are not included in the oneTBB specification. @@ -50,3 +50,4 @@ The key properties of a preview feature are: concurrent_lru_cache_cls task_group_extensions custom_mutex_chmap + try_put_and_wait diff --git a/third-party/tbb/doc/main/reference/rvalue_reduce.rst b/third-party/tbb/doc/main/reference/rvalue_reduce.rst index 53880952aa..7cf66d86b3 100644 --- a/third-party/tbb/doc/main/reference/rvalue_reduce.rst +++ b/third-party/tbb/doc/main/reference/rvalue_reduce.rst @@ -10,8 +10,8 @@ Parallel Reduction for rvalues Description *********** -|full_name| implementation extends the `ParallelReduceFunc `_ and -`ParallelReduceReduction `_ +|full_name| implementation extends the `ParallelReduceFunc `_ and +`ParallelReduceReduction `_ to optimize operating with ``rvalues`` using functional form of ``tbb::parallel_reduce`` and ``tbb::parallel_deterministic_reduce`` algorithms. API @@ -33,8 +33,9 @@ or .. cpp:function:: Value Func::operator()(const Range& range, const Value& x) const - Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the `Range requirements _`. - The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. + Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the + `Range requirements `_. + The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred. @@ -47,7 +48,7 @@ or .. cpp:function:: Value Reduction::operator()(const Value& x, const Value& y) const - Combines the ``x`` and ``y`` results. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. 
+ Combines the ``x`` and ``y`` results. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred. @@ -55,6 +56,7 @@ Example ******* .. code:: cpp + // C++17 #include #include @@ -83,7 +85,7 @@ Example .. rubric:: See also -* `oneapi::tbb::parallel_reduce specification `_ -* `oneapi::tbb::parallel_deterministic_reduce specification `_ -* `ParallelReduceFunc specification `_ -* `ParallelReduceReduction specification `_ +* `oneapi::tbb::parallel_reduce specification `_ +* `oneapi::tbb::parallel_deterministic_reduce specification `_ +* `ParallelReduceFunc specification `_ +* `ParallelReduceReduction specification `_ diff --git a/third-party/tbb/doc/main/reference/task_group_extensions.rst b/third-party/tbb/doc/main/reference/task_group_extensions.rst index 10d3980161..47795f9574 100644 --- a/third-party/tbb/doc/main/reference/task_group_extensions.rst +++ b/third-party/tbb/doc/main/reference/task_group_extensions.rst @@ -13,7 +13,7 @@ task_group extensions Description *********** -|full_name| implementation extends the `tbb::task_group specification `_ with the requirements for a user-provided function object. +|full_name| implementation extends the `tbb::task_group specification `_ with the requirements for a user-provided function object. API @@ -83,7 +83,7 @@ As an optimization hint, ``F`` might return a ``task_handle``, which task object .. 
rubric:: See also -* `oneapi::tbb::task_group specification `_ -* `oneapi::tbb::task_group_context specification `_ -* `oneapi::tbb::task_group_status specification `_ -* `oneapi::tbb::task_handle class `_ +* `oneapi::tbb::task_group specification `_ +* `oneapi::tbb::task_group_context specification `_ +* `oneapi::tbb::task_group_status specification `_ +* `oneapi::tbb::task_handle class `_ diff --git a/third-party/tbb/doc/main/reference/try_put_and_wait.rst b/third-party/tbb/doc/main/reference/try_put_and_wait.rst new file mode 100644 index 0000000000..4e05961f39 --- /dev/null +++ b/third-party/tbb/doc/main/reference/try_put_and_wait.rst @@ -0,0 +1,324 @@ +.. _try_put_and_wait: + +Waiting for Single Messages in Flow Graph +========================================= + +.. contents:: + :local: + :depth: 1 + +Description +*********** + +This feature adds a new ``try_put_and_wait`` interface to the receiving nodes in the Flow Graph. +This function puts a message as an input into a Flow Graph and waits until all work related to +that message is complete. +``try_put_and_wait`` may reduce latency compared to calling ``graph::wait_for_all`` since +``graph::wait_for_all`` waits for all work, including work that is unrelated to the input message, to complete. + +``node.try_put_and_wait(msg)`` performs ``node.try_put(msg)`` on the node and waits until the work on ``msg`` is completed. +Therefore, the following conditions are true: + +* Any task initiated by any node in the Flow Graph that involves working with ``msg`` or any other intermediate result + computed from ``msg`` is completed. +* No intermediate results computed from ``msg`` remain in any buffers in the graph. + +.. caution:: + + To prevent ``try_put_and_wait`` calls from infinite waiting, avoid using buffering nodes at the end of the Flow Graph since the final result + will not be automatically consumed by the Flow Graph. + +.. 
caution:: + + The ``multifunction_node`` and ``async_node`` classes are not currently supported by this feature. Including one of these nodes in the + Flow Graph may cause ``try_put_and_wait`` to exit early, even if the computations on the initial input message are + still in progress. + +API +*** + +Header +------ + +.. code:: cpp + + #define TBB_PREVIEW_FLOW_GRAPH_FEATURES // macro option 1 + #define TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT // macro option 2 + #include + +Synopsis +-------- + +.. code:: cpp + + namespace oneapi { + namespace tbb { + template + class continue_node { + public: + bool try_put_and_wait(const continue_msg& input); + }; // class continue_node + + template + class function_node { + public: + bool try_put_and_wait(const Input& input); + }; // class function_node + + template + class overwrite_node { + public: + bool try_put_and_wait(const T& input); + }; // class overwrite_node + + template + class write_once_node { + public: + bool try_put_and_wait(const T& input); + }; // class write_once_node + + template + class buffer_node { + public: + bool try_put_and_wait(const T& input); + }; // class buffer_node + + template + class queue_node { + public: + bool try_put_and_wait(const T& input); + }; // class queue_node + + template > + class priority_queue_node { + public: + bool try_put_and_wait(const T& input); + }; // class priority_queue_node + + template + class sequencer_node { + public: + bool try_put_and_wait(const T& input); + }; // class sequencer_node + + template + class limiter_node { + public: + bool try_put_and_wait(const T& input); + }; // class limiter_node + + template + class broadcast_node { + public: + bool try_put_and_wait(const T& input); + }; // class broadcast_node + + template + class split_node { + public: + bool try_put_and_wait(const TupleType& input); + }; // class split_node + } // namespace tbb + } // namespace oneapi + +Member Functions +---------------- + +.. 
code:: cpp + + template + bool continue_node::try_put_and_wait(const continue_msg& input) + +**Effects**: Increments the count of input signals received. If the incremented count is equal to the number +of known predecessors, performs the ``body`` function object execution. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool function_node::try_put_and_wait(const Input& input) + +**Effects**: If the concurrency limit allows, executes the user-provided body on the incoming message ``input``. +Otherwise, depending on the ``Policy`` of the node, either queues the incoming message ``input`` or rejects it. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` if the input is accepted, ``false`` otherwise. + +.. code:: cpp + + template + bool overwrite_node::try_put_and_wait(const T& input) + +**Effects**: Stores ``input`` in the internal single-item buffer and broadcasts it to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. caution:: + + Since the input element is not retrieved from ``overwrite_node`` once accepted by the successor, + retrieve it by explicitly calling the ``clear()`` method or by overwriting with another element to prevent + ``try_put_and_wait`` from indefinite waiting. + +.. code:: cpp + + template + bool write_once_node::try_put_and_wait(const T& input) + +**Effects**: Stores ``input`` in the internal single-item buffer if it does not contain a valid value already. 
+If a new value is set, the node broadcasts it to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` for the first time after construction or a call to ``clear()``. + +.. caution:: + + Since the input element is not retrieved from the ``write_once_node`` once accepted by the successor, + retrieve it by explicitly calling the ``clear()`` method to prevent ``try_put_and_wait`` from indefinite waiting. + +.. code:: cpp + + template + bool buffer_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the set of items managed by the node and tries forwarding it to a successor. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool queue_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the set of items managed by the node and tries forwarding the least recently added item +to a successor. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool priority_queue_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the ``priority_queue_node`` and attempts to forward the item with the highest +priority among all items added to the node but not yet forwarded to the successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. 
code:: cpp + + template + bool sequencer_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the ``sequencer_node`` and tries forwarding the next item in sequence to a successor. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool limiter_node::try_put_and_wait(const T& input) + +**Effects**: If the broadcast count is below the threshold, broadcasts ``input`` to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` if ``input`` is broadcasted; ``false`` otherwise. + +.. code:: cpp + + template + bool broadcast_node::try_put_and_wait(const T& input) + +**Effects**: Broadcasts ``input`` to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` even if the node cannot successfully forward the message to any of its successors. + +.. code:: cpp + + template + bool split_node::try_put_and_wait(const TupleType& input); + +**Effects**: Broadcasts each element in the incoming tuple to the nodes connected to the ``split_node`` output ports. +The element at index ``i`` of ``input`` is broadcasted through the output port number ``i``. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +Example +******* + +.. 
code:: cpp + + #define TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + #include + #include + + struct f1_body; + struct f2_body; + struct f3_body; + struct f4_body; + + int main() { + using namespace oneapi::tbb; + + flow::graph g; + flow::broadcast_node start_node(g); + + flow::function_node f1(g, flow::unlimited, f1_body{}); + flow::function_node f2(g, flow::unlimited, f2_body{}); + flow::function_node f3(g, flow::unlimited, f3_body{}); + + flow::join_node> join(g); + + flow::function_node, int> f4(g, flow::serial, f4_body{}); + + flow::make_edge(start_node, f1); + flow::make_edge(f1, f2); + + flow::make_edge(start_node, f3); + + flow::make_edge(f2, flow::input_port<0>(join)); + flow::make_edge(f3, flow::input_port<1>(join)); + + flow::make_edge(join, f4); + + // Submit work into the graph + parallel_for(0, 100, [](int input) { + start_node.try_put_and_wait(input); + + // Post processing the result of input + }); + } + +Each iteration of ``parallel_for`` submits an input into the Flow Graph. After returning from ``try_put_and_wait(input)``, it is +guaranteed that all of the work related to the completion of ``input`` is done by all of the nodes in the graph. Tasks related to inputs +submitted by other calls are not guaranteed to be completed. diff --git a/third-party/tbb/doc/main/reference/type_specified_message_keys.rst b/third-party/tbb/doc/main/reference/type_specified_message_keys.rst index 3b1dbc56fa..a50cd7f434 100644 --- a/third-party/tbb/doc/main/reference/type_specified_message_keys.rst +++ b/third-party/tbb/doc/main/reference/type_specified_message_keys.rst @@ -66,4 +66,4 @@ lookup and used in place of the default implementation. 
See Also ******** -`join_node Specification `_ +`join_node Specification `_ diff --git a/third-party/tbb/doc/main/tbb_userguide/Constraints.rst b/third-party/tbb/doc/main/tbb_userguide/Constraints.rst index d37ce12028..1928fe8eeb 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Constraints.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Constraints.rst @@ -4,7 +4,7 @@ Constrained APIs ================ Starting from C++20, most of |full_name| APIs are constrained to -enforce `named requirements `_ on +enforce `named requirements `_ on template arguments types. The violations of these requirements are detected at a compile time during the template instantiation. diff --git a/third-party/tbb/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst b/third-party/tbb/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst index 724b8b6ec9..290f2f2cc3 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst @@ -22,14 +22,11 @@ the following steps generally occur: thread that invoked the algorithm. -The exception thrown in step 3 might be the original exception, or might -merely be a summary of type ``captured_exception``. The latter usually -occurs on current systems because propagating exceptions between threads -requires support for the C++ ``std::exception_ptr`` functionality. As -compilers evolve to support this functionality, future versions of +As compilers evolve to support this functionality, future versions of oneTBB might throw the original exception. So be sure your code can catch either type of exception. The following example demonstrates -exception handling. 
+exception handling: + :: diff --git a/third-party/tbb/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst b/third-party/tbb/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst index 5ad1670baa..744794fc07 100644 --- a/third-party/tbb/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst +++ b/third-party/tbb/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst @@ -7,7 +7,7 @@ How Task Scheduler Works While the task scheduler is not bound to any particular type of parallelism, it was designed to work efficiently for fork-join parallelism with lots of forks. This type of parallelism is typical for parallel algorithms such as `oneapi::tbb::parallel_for -`_. +`_. Let's consider the mapping of fork-join parallelism on the task scheduler in more detail. diff --git a/third-party/tbb/doc/main/tbb_userguide/Linux_OS.rst b/third-party/tbb/doc/main/tbb_userguide/Linux_OS.rst index 1d25a04dcd..0f0c245720 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Linux_OS.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Linux_OS.rst @@ -25,12 +25,12 @@ structure for Linux\*, relative to ** - | ``LIBRARY_PATH`` | ``LD_LIBRARY_PATH`` -where +Where: * ```` - ``ia32`` or ``intel64`` + + .. note:: Starting with oneTBB 2022.0, 32-bit binaries are supported only by the open-source version of the library. 
* ```` - ``libtbb``, ``libtbbmalloc``, ``libtbbmalloc_proxy`` or ``libtbbbind`` - * ```` - ``_debug`` or empty - -* ```` - binary version in a form of ``.`` \ No newline at end of file +* ```` - binary version in a form of ``.`` diff --git a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst index aa8658acf8..6acdb272eb 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst @@ -14,27 +14,27 @@ Querying the default number of threads ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * `oneapi::tbb::info::default_concurrency() - `_ + `_ returns the maximum concurrency that will be created by *default* in implicit or explicit ``task_arena``. * `oneapi::tbb::this_task_arena::max_concurrency() - `_ + `_ returns the maximum number of threads available for the parallel algorithms within the current context (or *default* if an implicit arena is not initialized) * `oneapi::tbb::global_control::active_value(tbb::global_control::max_allowed_parallelism) - `_ + `_ returns the current limit of the thread pool (or *default* if oneTBB scheduler is not initialized) Setting the maximum concurrency ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * `task_arena(/* max_concurrency */) - `_ + `_ limits the maximum concurrency of the parallel algorithm running inside ``task_arena`` * `tbb::global_control(tbb::global_control::max_allowed_parallelism, /* max_concurrency */) - `_ + `_ limits the total number of oneTBB worker threads Examples @@ -116,7 +116,7 @@ The limited parallelism: Setting thread stack size --------------------------------------- Use `oneapi::tbb::global_control(oneapi::tbb::global_control::thread_stack_size, /* stack_size */) -`_ +`_ to set the stack size for oneTBB worker threads: .. 
code:: cpp @@ -141,7 +141,7 @@ to set the stack size for oneTBB worker threads: Terminating oneTBB scheduler --------------------------------------- -`task_scheduler_handle `_ +`task_scheduler_handle `_ allows waiting for oneTBB worker threads completion: .. code:: cpp diff --git a/third-party/tbb/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst b/third-party/tbb/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst index f4f78ae567..cd2d2e1a93 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst @@ -44,7 +44,6 @@ To do the replacement use one of the following methods: - Alternatively, add the following parameters to the linker options for the .exe or .dll file that is loaded during application startup. - For 32-bit code (note the triple underscore): @@ -52,8 +51,7 @@ To do the replacement use one of the following methods: tbbmalloc_proxy.lib /INCLUDE:"___TBB_malloc_proxy" - - + For 64-bit code (note the double underscore): diff --git a/third-party/tbb/doc/main/tbb_userguide/Windows_OS_ug.rst b/third-party/tbb/doc/main/tbb_userguide/Windows_OS_ug.rst index 3fc4a5a223..85fc3306ce 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Windows_OS_ug.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Windows_OS_ug.rst @@ -30,12 +30,13 @@ structure for Windows\*, relative to <*tbb_install_dir*>. - Same as corresponding ``.dll`` file. - \ -where +Where * ```` - ``ia32`` or ``intel64`` -* ```` - ``tbb``, ``tbbmalloc``, ``tbbmalloc_proxy`` or ``tbbbind`` + .. note:: Starting with oneTBB 2022.0, 32-bit binaries are supported only by the open-source version of the library. 
+* ```` - ``tbb``, ``tbbmalloc``, ``tbbmalloc_proxy`` or ``tbbbind`` * ```` - ``14`` - use for dynamic linkage with the CRT @@ -47,11 +48,10 @@ where - ``_mt`` - use for static linkage with the CRT * ```` - ``_debug`` or empty - * ```` - binary version -The last column shows which environment variables are used by the -Microsoft\* Visual C++\* or Intel® C++ Compiler Classic or Intel® oneAPI DPC++/C++ Compiler to find these +The last column shows, which environment variables are used by the +Microsoft\* Visual C++\* or Intel® C++ Compiler Classic or Intel® oneAPI DPC++/C++ Compiler, to find these subdirectories. .. CAUTION:: diff --git a/third-party/tbb/examples/task_group/sudoku/README.md b/third-party/tbb/examples/task_group/sudoku/README.md index 0e3ef499ea..9f59bbb446 100644 --- a/third-party/tbb/examples/task_group/sudoku/README.md +++ b/third-party/tbb/examples/task_group/sudoku/README.md @@ -1,4 +1,4 @@ -# Fractal sample +# Sudoku sample This directory contains an example that finds all solutions to a Sudoku board. It uses a straightforward state-space search algorithm that exhibits OR-parallelism. It can be optionally run until it obtains just the first solution. The point of the example is to teach how to use the `task_group` interface. @@ -11,9 +11,8 @@ cmake --build . ## Running the sample ### Predefined make targets -* `make run_fractal` - executes the example with predefined parameters. -* `make perf_run_fractal` - executes the example with suggested parameters to measure the oneTBB performance. -* `make light_test_fractal` - executes the example with suggested parameters to reduce execution time. +* `make run_sudoku` - executes the example with predefined parameters. +* `make perf_run_sudoku` - executes the example with suggested parameters to measure the oneTBB performance. 
### Application parameters Usage: diff --git a/third-party/tbb/examples/test_all/fibonacci/README.md b/third-party/tbb/examples/test_all/fibonacci/README.md index 3d1d795df8..f65edcece7 100644 --- a/third-party/tbb/examples/test_all/fibonacci/README.md +++ b/third-party/tbb/examples/test_all/fibonacci/README.md @@ -1,4 +1,4 @@ -# Fractal sample +# Fibonacci sample This directory contains an example that computes Fibonacci numbers in several different ways. The purpose of the example is to exercise every include file and class in Intel® oneAPI Threading Building Blocks. Most of the computations are deliberately silly and not expected to show any speedup on multiprocessors. @@ -11,9 +11,7 @@ cmake --build . ## Running the sample ### Predefined make targets -* `make run_fractal` - executes the example with predefined parameters. -* `make perf_run_fractal` - executes the example with suggested parameters to measure the oneTBB performance. -* `make light_test_fractal` - executes the example with suggested parameters to reduce execution time. +* `make run_fibonacci` - executes the example with predefined parameters. ### Application parameters Usage: diff --git a/third-party/tbb/include/oneapi/tbb/collaborative_call_once.h b/third-party/tbb/include/oneapi/tbb/collaborative_call_once.h index db082f891a..18e3bbb245 100644 --- a/third-party/tbb/include/oneapi/tbb/collaborative_call_once.h +++ b/third-party/tbb/include/oneapi/tbb/collaborative_call_once.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2021 Intel Corporation + Copyright (c) 2021-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -32,6 +32,27 @@ namespace d1 { #pragma warning (disable: 4324) #endif +template +class collaborative_call_stack_task : public task { + const F& m_func; + wait_context& m_wait_ctx; + + void finalize() { + m_wait_ctx.release(); + } + task* execute(d1::execution_data&) override { + task* res = d2::task_ptr_or_nullptr(m_func); + finalize(); + return res; + } + task* cancel(d1::execution_data&) override { + finalize(); + return nullptr; + } +public: + collaborative_call_stack_task(const F& f, wait_context& wctx) : m_func(f), m_wait_ctx(wctx) {} +}; + constexpr std::uintptr_t collaborative_once_max_references = max_nfs_size; constexpr std::uintptr_t collaborative_once_references_mask = collaborative_once_max_references-1; @@ -103,7 +124,7 @@ class alignas(max_nfs_size) collaborative_once_runner : no_copy { task_group_context context{ task_group_context::bound, task_group_context::default_traits | task_group_context::concurrent_wait }; - function_stack_task t{ std::forward(f), m_storage.m_wait_context }; + collaborative_call_stack_task t{ std::forward(f), m_storage.m_wait_context }; // Set the ready flag after entering the execute body to prevent // moonlighting threads from occupying all slots inside the arena. 
@@ -151,7 +172,7 @@ class collaborative_once_flag : no_copy { spin_wait_until_eq(m_state, expected); } while (!m_state.compare_exchange_strong(expected, desired)); } - + template void do_collaborative_call_once(Fn&& f) { std::uintptr_t expected = m_state.load(std::memory_order_acquire); diff --git a/third-party/tbb/include/oneapi/tbb/concurrent_unordered_map.h b/third-party/tbb/include/oneapi/tbb/concurrent_unordered_map.h index 336425cc8f..9cade0a94e 100644 --- a/third-party/tbb/include/oneapi/tbb/concurrent_unordered_map.h +++ b/third-party/tbb/include/oneapi/tbb/concurrent_unordered_map.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,14 +24,14 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct concurrent_unordered_map_traits { using value_type = std::pair; using key_type = Key; using allocator_type = Allocator; - using hash_compare_type = hash_compare; + using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { @@ -399,13 +399,13 @@ void swap( concurrent_unordered_multimap& lhs lhs.swap(rhs); } -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { -using detail::d1::concurrent_unordered_map; -using detail::d1::concurrent_unordered_multimap; +using detail::d2::concurrent_unordered_map; +using detail::d2::concurrent_unordered_multimap; using detail::split; } // inline namespace v1 diff --git a/third-party/tbb/include/oneapi/tbb/concurrent_unordered_set.h b/third-party/tbb/include/oneapi/tbb/concurrent_unordered_set.h index c135b92222..b7e4b4cafc 100644 --- a/third-party/tbb/include/oneapi/tbb/concurrent_unordered_set.h +++ b/third-party/tbb/include/oneapi/tbb/concurrent_unordered_set.h @@ -1,5 
+1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,14 +23,14 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct concurrent_unordered_set_traits { using key_type = Key; using value_type = key_type; using allocator_type = Allocator; - using hash_compare_type = hash_compare; + using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { @@ -318,13 +318,13 @@ void swap( concurrent_unordered_multiset& lhs, lhs.swap(rhs); } -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { -using detail::d1::concurrent_unordered_set; -using detail::d1::concurrent_unordered_multiset; +using detail::d2::concurrent_unordered_set; +using detail::d2::concurrent_unordered_multiset; using detail::split; } // inline namespace v1 diff --git a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h index 408292086a..85f54d0a57 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -46,7 +46,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template class concurrent_unordered_base; @@ -171,7 +171,7 @@ class value_node : public list_node value_node( sokey_type ord_key ) : base_type(ord_key) {} ~value_node() {} value_type* storage() { - return reinterpret_cast(&my_value); + return &my_value; } value_type& value() { @@ -179,8 +179,9 @@ class value_node : public list_node } private: - using aligned_storage_type = typename std::aligned_storage::type; - aligned_storage_type my_value; + union { + value_type my_value; + }; }; // class value_node template @@ -237,7 +238,7 @@ class concurrent_unordered_base { template using is_transparent = dependent_bool, T>; public: - using node_type = node_handle; + using node_type = d1::node_handle; explicit concurrent_unordered_base( size_type bucket_count, const hasher& hash = hasher(), const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) @@ -441,7 +442,7 @@ class concurrent_unordered_base { std::pair insert( node_type&& nh ) { if (!nh.empty()) { - value_node_ptr insert_node = node_handle_accessor::get_node_ptr(nh); + value_node_ptr insert_node = d1::node_handle_accessor::get_node_ptr(nh); auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { insert_node->init(order_key); return insert_node; @@ -451,7 +452,7 @@ class concurrent_unordered_base { // If the insertion succeeded - set node handle to the empty state __TBB_ASSERT(insert_result.remaining_node == nullptr, "internal_insert_node should not return the remaining node if the insertion succeeded"); - node_handle_accessor::deactivate(nh); + d1::node_handle_accessor::deactivate(nh); } return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; } @@ -521,12 +522,12 @@ class concurrent_unordered_base { node_type unsafe_extract( const_iterator pos ) { internal_extract(pos.get_node_ptr()); - return node_handle_accessor::construct(pos.get_node_ptr()); + return 
d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( iterator pos ) { internal_extract(pos.get_node_ptr()); - return node_handle_accessor::construct(pos.get_node_ptr()); + return d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( const key_type& key ) { @@ -787,11 +788,11 @@ class concurrent_unordered_base { static constexpr size_type pointers_per_embedded_table = sizeof(size_type) * 8 - 1; class unordered_segment_table - : public segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> + : public d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> { using self_type = unordered_segment_table; using atomic_node_ptr = std::atomic; - using base_type = segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; + using base_type = d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; using segment_type = typename base_type::segment_type; using base_allocator_type = typename base_type::allocator_type; @@ -921,7 +922,7 @@ class concurrent_unordered_base { node_allocator_traits::deallocate(dummy_node_allocator, node, 1); } else { // GCC 11.1 issues a warning here that incorrect destructor might be called for dummy_nodes - #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER + #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 150000 ) && !__clang__ && !__INTEL_COMPILER volatile #endif value_node_ptr val_node = static_cast(node); @@ -1212,7 +1213,7 @@ class concurrent_unordered_base { // Node handle with curr cannot be used directly in insert call, because // the destructor of node_type will destroy curr - node_type curr_node = node_handle_accessor::construct(curr); + node_type curr_node = d1::node_handle_accessor::construct(curr); // If the insertion fails - return ownership of the node to the source if 
(!insert(std::move(curr_node)).second) { @@ -1230,7 +1231,7 @@ class concurrent_unordered_base { curr->set_next(next_node); source_prev->set_next(curr); source_prev = curr; - node_handle_accessor::deactivate(curr_node); + d1::node_handle_accessor::deactivate(curr_node); } else { source.my_size.fetch_sub(1, std::memory_order_relaxed); } @@ -1507,7 +1508,7 @@ bool operator!=( const concurrent_unordered_base& lhs, #pragma warning(pop) // warning 4127 is back #endif -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/third-party/tbb/include/oneapi/tbb/detail/_config.h b/third-party/tbb/include/oneapi/tbb/detail/_config.h index d6705e154c..e676b1558b 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_config.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_config.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -335,7 +335,7 @@ #define __TBB_TSX_INTRINSICS_PRESENT (__RTM__ || __INTEL_COMPILER || (_MSC_VER>=1700 && (__TBB_x86_64 || __TBB_x86_32))) -#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || __TBB_GCC_VERSION >= 110000 || __TBB_CLANG_VERSION >= 120000) \ +#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || (__TBB_GCC_VERSION >= 110000 && __TBB_GNU_ASM_VERSION >= 2032) || __TBB_CLANG_VERSION >= 120000) \ && (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) && !__ANDROID__) /** Internal TBB features & modes **/ @@ -521,6 +521,11 @@ #define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES) #endif +#ifndef __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +#define __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT (TBB_PREVIEW_FLOW_GRAPH_FEATURES \ + || TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT) +#endif + #if TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS #define __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 #endif diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h index 8ac11211f6..21da06ce03 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 (in flow_graph.h) +// included in namespace tbb::detail::d2 (in flow_graph.h) typedef std::uint64_t tag_value; @@ -53,7 +53,7 @@ namespace graph_policy_namespace { // K == type of field used for key-matching. 
Each tag-matching port will be provided // functor that, given an object accepted by the port, will return the /// field of type K being used for matching. - template::type > > + template::type > > __TBB_requires(tbb::detail::hash_compare) struct key_matching { typedef K key_type; @@ -77,7 +77,7 @@ template< typename Output > class input_body : no_assign { public: virtual ~input_body() {} - virtual Output operator()(flow_control& fc) = 0; + virtual Output operator()(d1::flow_control& fc) = 0; virtual input_body* clone() = 0; }; @@ -86,7 +86,7 @@ template< typename Output, typename Body> class input_body_leaf : public input_body { public: input_body_leaf( const Body &_body ) : body(_body) { } - Output operator()(flow_control& fc) override { return body(fc); } + Output operator()(d1::flow_control& fc) override { return body(fc); } input_body_leaf* clone() override { return new input_body_leaf< Output, Body >(body); } @@ -249,12 +249,12 @@ template< typename NodeType > class forward_task_bypass : public graph_task { NodeType &my_node; public: - forward_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n + forward_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n , node_priority_t node_priority = no_priority ) : graph_task(g, allocator, node_priority), my_node(n) {} - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.forward_task(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; @@ -264,7 +264,7 @@ class forward_task_bypass : public graph_task { return next_task; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -272,29 +272,57 @@ class forward_task_bypass : public graph_task { //! 
A task that calls a node's apply_body_bypass function, passing in an input of type Input // return the task* unless it is SUCCESSFULLY_ENQUEUED, in which case return nullptr -template< typename NodeType, typename Input > -class apply_body_task_bypass : public graph_task { +template< typename NodeType, typename Input, typename BaseTaskType = graph_task> +class apply_body_task_bypass + : public BaseTaskType +{ NodeType &my_node; Input my_input; + + using check_metainfo = std::is_same; + using without_metainfo = std::true_type; + using with_metainfo = std::false_type; + + graph_task* call_apply_body_bypass_impl(without_metainfo) { + return my_node.apply_body_bypass(my_input + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* call_apply_body_bypass_impl(with_metainfo) { + return my_node.apply_body_bypass(my_input, message_metainfo{this->get_msg_wait_context_vertices()}); + } +#endif + + graph_task* call_apply_body_bypass() { + return call_apply_body_bypass_impl(check_metainfo{}); + } + public: +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n, const Input &i, + node_priority_t node_priority, Metainfo&& metainfo ) + : BaseTaskType(g, allocator, node_priority, std::forward(metainfo).waiters()) + , my_node(n), my_input(i) {} +#endif - apply_body_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n, const Input &i - , node_priority_t node_priority = no_priority - ) : graph_task(g, allocator, node_priority), - my_node(n), my_input(i) {} + apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType& n, const Input& i, + node_priority_t node_priority = no_priority ) + : BaseTaskType(g, allocator, node_priority), my_node(n), my_input(i) {} - task* execute(execution_data& ed) override { - graph_task* next_task = my_node.apply_body_bypass( my_input ); + d1::task* 
execute(d1::execution_data& ed) override { + graph_task* next_task = call_apply_body_bypass(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; else if (next_task) next_task = prioritize_task(my_node.graph_reference(), *next_task); - finalize(ed); + BaseTaskType::template finalize(ed); return next_task; } - task* cancel(execution_data& ed) override { - finalize(ed); + d1::task* cancel(d1::execution_data& ed) override { + BaseTaskType::template finalize(ed); return nullptr; } }; @@ -304,10 +332,10 @@ template< typename NodeType > class input_node_task_bypass : public graph_task { NodeType &my_node; public: - input_node_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n ) + input_node_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n ) : graph_task(g, allocator), my_node(n) {} - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.apply_body_bypass( ); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; @@ -317,7 +345,7 @@ class input_node_task_bypass : public graph_task { return next_task; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -343,6 +371,15 @@ class threshold_regulatormy_graph; } @@ -361,7 +398,14 @@ class threshold_regulator : public continue_receiver, no_ T *my_node; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Intentionally ignore the metainformation + // If there are more items associated with passed metainfo to be processed + // They should be stored in the buffer before the limiter_node + graph_task* execute(const message_metainfo&) override { +#else graph_task* execute() override { +#endif return my_node->decrement_counter( 1 ); } diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h index 059f198055..647f3dc1b6 100644 
--- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 (in flow_graph.h) +// included in namespace tbb::detail::d2 (in flow_graph.h) //! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock. template< typename T, typename M=spin_mutex > @@ -98,9 +98,12 @@ class predecessor_cache : public node_cache< sender, M > { // Do not work with the passed pointer here as it may not be fully initialized yet } - bool get_item( output_type& v ) { +private: + bool get_item_impl( output_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo_ptr = nullptr) ) + { - bool msg = false; + bool successful_get = false; do { predecessor_type *src; @@ -113,18 +116,35 @@ class predecessor_cache : public node_cache< sender, M > { } // Try to get from this sender - msg = src->try_get( v ); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (metainfo_ptr) { + successful_get = src->try_get( v, *metainfo_ptr ); + } else +#endif + { + successful_get = src->try_get( v ); + } - if (msg == false) { + if (successful_get == false) { // Relinquish ownership of the edge register_successor(*src, *my_owner); } else { // Retain ownership of the edge this->add(*src); } - } while ( msg == false ); - return msg; + } while ( successful_get == false ); + return successful_get; } +public: + bool get_item( output_type& v ) { + return get_item_impl(v); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( output_type& v, message_metainfo& metainfo ) { + return 
get_item_impl(v, &metainfo); + } +#endif // If we are removing arcs (rf_clear_edges), call clear() rather than reset(). void reset() { @@ -157,8 +177,9 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { // Do not work with the passed pointer here as it may not be fully initialized yet } - bool try_reserve( output_type &v ) { - bool msg = false; +private: + bool try_reserve_impl( output_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo) ) { + bool successful_reserve = false; do { predecessor_type* pred = nullptr; @@ -172,9 +193,16 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { } // Try to get from this sender - msg = pred->try_reserve( v ); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (metainfo) { + successful_reserve = pred->try_reserve( v, *metainfo ); + } else +#endif + { + successful_reserve = pred->try_reserve( v ); + } - if (msg == false) { + if (successful_reserve == false) { typename mutex_type::scoped_lock lock(this->my_mutex); // Relinquish ownership of the edge register_successor( *pred, *this->my_owner ); @@ -183,11 +211,21 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { // Retain ownership of the edge this->add( *pred); } - } while ( msg == false ); + } while ( successful_reserve == false ); - return msg; + return successful_reserve; + } +public: + bool try_reserve( output_type& v ) { + return try_reserve_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(nullptr)); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_reserve( output_type& v, message_metainfo& metainfo ) { + return try_reserve_impl(v, &metainfo); + } +#endif + bool try_release() { reserved_src.load(std::memory_order_relaxed)->try_release(); reserved_src.store(nullptr, std::memory_order_relaxed); @@ -268,6 +306,9 @@ class successor_cache : no_copy { } virtual graph_task* try_put_task( const T& t ) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* try_put_task( const T& t, 
const message_metainfo& metainfo ) = 0; +#endif }; // successor_cache //! An abstract cache of successors, specialized to continue_msg @@ -327,6 +368,9 @@ class successor_cache< continue_msg, M > : no_copy { } virtual graph_task* try_put_task( const continue_msg& t ) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* try_put_task( const continue_msg& t, const message_metainfo& metainfo ) = 0; +#endif }; // successor_cache< continue_msg > //! A cache of successors that are broadcast to @@ -336,19 +380,12 @@ class broadcast_cache : public successor_cache { typedef M mutex_type; typedef typename successor_cache::successors_type successors_type; -public: - - broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { - // Do not work with the passed pointer here as it may not be fully initialized yet - } - - // as above, but call try_put_task instead, and return the last task we received (if any) - graph_task* try_put_task( const T &t ) override { + graph_task* try_put_task_impl( const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { graph_task * last_task = nullptr; typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { - graph_task *new_task = (*i)->try_put_task(t); + graph_task *new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); // workaround for icc bug graph& graph_ref = (*i)->graph_reference(); last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary @@ -365,6 +402,21 @@ class broadcast_cache : public successor_cache { } return last_task; } +public: + + broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + graph_task* try_put_task( const T &t ) override { + return try_put_task_impl(t 
__TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T &t, const message_metainfo& metainfo ) override { + return try_put_task_impl(t, metainfo); + } +#endif // call try_put_task and return list of received tasks bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) { @@ -411,11 +463,15 @@ class round_robin_cache : public successor_cache { return this->my_successors.size(); } - graph_task* try_put_task( const T &t ) override { +private: + + graph_task* try_put_task_impl( const T &t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) + { typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { - graph_task* new_task = (*i)->try_put_task(t); + graph_task* new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( new_task ) { return new_task; } else { @@ -429,6 +485,17 @@ class round_robin_cache : public successor_cache { } return nullptr; } + +public: + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T& t, const message_metainfo& metainfo ) override { + return try_put_task_impl(t, metainfo); + } +#endif }; #endif // __TBB__flow_graph_cache_impl_H diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h index 8207667f37..19e00a8ef1 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 
except in compliance with the License. @@ -30,7 +30,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { class graph_task; static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1; @@ -123,32 +123,98 @@ void enqueue_in_graph_arena(graph &g, graph_task& arena_task); class graph; //! Base class for tasks generated by graph nodes. -class graph_task : public task { +class graph_task : public d1::task { public: - graph_task(graph& g, small_object_allocator& allocator - , node_priority_t node_priority = no_priority - ) - : my_graph(g) - , priority(node_priority) - , my_allocator(allocator) - {} + graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority = no_priority); + graph& my_graph; // graph instance the task belongs to // TODO revamp: rename to my_priority node_priority_t priority; template - void destruct_and_deallocate(const execution_data& ed); + void destruct_and_deallocate(const d1::execution_data& ed); protected: template - void finalize(const execution_data& ed); + void finalize(const d1::execution_data& ed); private: // To organize task_list graph_task* my_next{ nullptr }; - small_object_allocator my_allocator; + d1::small_object_allocator my_allocator; + d1::wait_tree_vertex_interface* my_reference_vertex; // TODO revamp: elaborate internal interfaces to avoid friends declarations friend class graph_task_list; friend graph_task* prioritize_task(graph& g, graph_task& gt); }; +inline bool is_this_thread_in_graph_arena(graph& g); + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +class trackable_messages_graph_task : public graph_task { +public: + trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority, + const std::forward_list& msg_waiters) + : graph_task(g, allocator, node_priority) + , my_msg_wait_context_vertices(msg_waiters) + { + auto last_iterator = my_msg_reference_vertices.cbefore_begin(); + + for (auto& msg_waiter : 
my_msg_wait_context_vertices) { + // If the task is created by the thread outside the graph arena, the lifetime of the thread reference vertex + // may be shorter that the lifetime of the task, so thread reference vertex approach cannot be used + // and the task should be associated with the msg wait context itself + d1::wait_tree_vertex_interface* ref_vertex = is_this_thread_in_graph_arena(g) ? + r1::get_thread_reference_vertex(msg_waiter) : + msg_waiter; + last_iterator = my_msg_reference_vertices.emplace_after(last_iterator, + ref_vertex); + ref_vertex->reserve(1); + } + } + + trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority, + std::forward_list&& msg_waiters) + : graph_task(g, allocator, node_priority) + , my_msg_wait_context_vertices(std::move(msg_waiters)) + { + } + + const std::forward_list get_msg_wait_context_vertices() const { + return my_msg_wait_context_vertices; + } + +protected: + template + void finalize(const d1::execution_data& ed) { + auto wait_context_vertices = std::move(my_msg_wait_context_vertices); + auto msg_reference_vertices = std::move(my_msg_reference_vertices); + graph_task::finalize(ed); + + // If there is no thread reference vertices associated with the task + // then this task was created by transferring the ownership from other metainfo + // instance (e.g. while taking from the buffer) + if (msg_reference_vertices.empty()) { + for (auto& msg_waiter : wait_context_vertices) { + msg_waiter->release(1); + } + } else { + for (auto& msg_waiter : msg_reference_vertices) { + msg_waiter->release(1); + } + } + } +private: + // Each task that holds information about single message wait_contexts should hold two lists + // The first one is wait_contexts associated with the message itself. They are needed + // to be able to broadcast the list of wait_contexts to the node successors while executing the task. 
+ // The second list is a list of reference vertices for each wait_context_vertex in the first list + // to support the distributed reference counting schema + std::forward_list my_msg_wait_context_vertices; + std::forward_list my_msg_reference_vertices; +}; // class trackable_messages_graph_task +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + struct graph_task_comparator { bool operator()(const graph_task* left, const graph_task* right) { return left->priority < right->priority; @@ -157,18 +223,18 @@ struct graph_task_comparator { typedef tbb::concurrent_priority_queue graph_task_priority_queue_t; -class priority_task_selector : public task { +class priority_task_selector : public d1::task { public: - priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator) + priority_task_selector(graph_task_priority_queue_t& priority_queue, d1::small_object_allocator& allocator) : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {} - task* execute(execution_data& ed) override { + task* execute(d1::execution_data& ed) override { next_task(); __TBB_ASSERT(my_task, nullptr); task* t_next = my_task->execute(ed); my_allocator.delete_object(this, ed); return t_next; } - task* cancel(execution_data& ed) override { + task* cancel(d1::execution_data& ed) override { if (!my_task) { next_task(); } @@ -190,7 +256,7 @@ class priority_task_selector : public task { } graph_task_priority_queue_t& my_priority_queue; - small_object_allocator my_allocator; + d1::small_object_allocator my_allocator; graph_task* my_task; }; @@ -281,7 +347,7 @@ class graph : no_copy, public graph_proxy { caught_exception = false; try_call([this] { my_task_arena->execute([this] { - wait(my_wait_context, *my_context); + wait(my_wait_context_vertex.get_context(), *my_context); }); cancelled = my_context->is_group_execution_cancelled(); }).on_exception([this] { @@ -332,7 +398,7 @@ class graph : no_copy, public graph_proxy { bool exception_thrown() 
{ return caught_exception; } private: - wait_context my_wait_context; + d1::wait_context_vertex my_wait_context_vertex; task_group_context *my_context; bool own_context; bool cancelled; @@ -349,19 +415,25 @@ class graph : no_copy, public graph_proxy { graph_task_priority_queue_t my_priority_queue; + d1::wait_context_vertex& get_wait_context_vertex() { return my_wait_context_vertex; } + friend void activate_graph(graph& g); friend void deactivate_graph(graph& g); friend bool is_graph_active(graph& g); + friend bool is_this_thread_in_graph_arena(graph& g); friend graph_task* prioritize_task(graph& g, graph_task& arena_task); friend void spawn_in_graph_arena(graph& g, graph_task& arena_task); friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task); - friend class task_arena_base; + friend class d1::task_arena_base; + friend class graph_task; + template + friend class receiver; }; // class graph template -inline void graph_task::destruct_and_deallocate(const execution_data& ed) { +inline void graph_task::destruct_and_deallocate(const d1::execution_data& ed) { auto allocator = my_allocator; // TODO: investigate if direct call of derived destructor gives any benefits. 
this->~graph_task(); @@ -369,10 +441,27 @@ inline void graph_task::destruct_and_deallocate(const execution_data& ed) { } template -inline void graph_task::finalize(const execution_data& ed) { - graph& g = my_graph; +inline void graph_task::finalize(const d1::execution_data& ed) { + d1::wait_tree_vertex_interface* reference_vertex = my_reference_vertex; destruct_and_deallocate(ed); - g.release_wait(); + reference_vertex->release(); +} + +inline graph_task::graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority) + : my_graph(g) + , priority(node_priority) + , my_allocator(allocator) +{ + // If the task is created by the thread outside the graph arena, the lifetime of the thread reference vertex + // may be shorter that the lifetime of the task, so thread reference vertex approach cannot be used + // and the task should be associated with the graph wait context itself + // TODO: consider how reference counting can be improved for such a use case. Most common example is the async_node + d1::wait_context_vertex* graph_wait_context_vertex = &my_graph.get_wait_context_vertex(); + my_reference_vertex = is_this_thread_in_graph_arena(g) ? r1::get_thread_reference_vertex(graph_wait_context_vertex) + : graph_wait_context_vertex; + __TBB_ASSERT(my_reference_vertex, nullptr); + my_reference_vertex->reserve(); } //******************************************************************************** @@ -424,15 +513,20 @@ inline bool is_graph_active(graph& g) { return g.my_is_active; } +inline bool is_this_thread_in_graph_arena(graph& g) { + __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr); + return r1::execution_slot(*g.my_task_arena) != d1::slot_id(-1); +} + inline graph_task* prioritize_task(graph& g, graph_task& gt) { if( no_priority == gt.priority ) return > //! Non-preemptive priority pattern. The original task is submitted as a work item to the //! 
priority queue, and a new critical task is created to take and execute a work item with - //! the highest known priority. The reference counting responsibility is transferred (via - //! allocate_continuation) to the new task. - task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); + //! the highest known priority. The reference counting responsibility is transferred to + //! the new task. + d1::task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); __TBB_ASSERT( critical_task, "bad_alloc?" ); g.my_priority_queue.push(>); using tbb::detail::d1::submit; @@ -443,7 +537,7 @@ inline graph_task* prioritize_task(graph& g, graph_task& gt) { //! Spawns a task inside graph arena inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) { if (is_graph_active(g)) { - task* gt = prioritize_task(g, arena_task); + d1::task* gt = prioritize_task(g, arena_task); if( !gt ) return; @@ -464,12 +558,12 @@ inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) { __TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" 
); // TODO revamp: decide on the approach that does not postpone critical task - if( task* gt = prioritize_task(g, arena_task) ) + if( d1::task* gt = prioritize_task(g, arena_task) ) submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false); } } -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h index f4f55a6c7a..a743310079 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 +// included in namespace tbb::detail::d2 #include "_flow_graph_types_impl.h" @@ -31,9 +31,9 @@ // successor. 
template - graph_task* do_try_put(const T &v, void *p) { + graph_task* do_try_put(const T &v, void *p __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { typename IndexerNodeBaseType::output_type o(K, v); - return reinterpret_cast(p)->try_put_task(&o); + return reinterpret_cast(p)->try_put_task(&o __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } template @@ -41,7 +41,7 @@ template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element::type T; - graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + auto indexer_node_put_task = do_try_put; std::get(my_input).set_up(p, indexer_node_put_task, g); indexer_helper::template set_indexer_node_pointer(my_input, p, g); } @@ -52,7 +52,7 @@ template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element<0, TupleTypes>::type T; - graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + auto indexer_node_put_task = do_try_put; std::get<0>(my_input).set_up(p, indexer_node_put_task, g); } }; @@ -61,7 +61,8 @@ class indexer_input_port : public receiver { private: void* my_indexer_ptr; - typedef graph_task* (* forward_function_ptr)(T const &, void* ); + typedef graph_task* (* forward_function_ptr)(T const &, void* + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo&)); forward_function_ptr my_try_put_task; graph* my_graph; public: @@ -76,9 +77,15 @@ template friend class broadcast_cache; template friend class round_robin_cache; graph_task* try_put_task(const T &v) override { - return my_try_put_task(v, my_indexer_ptr); + return my_try_put_task(v, my_indexer_ptr __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + return my_try_put_task(v, my_indexer_ptr, metainfo); + } +#endif + graph& 
graph_reference() const override { return *my_graph; } @@ -118,7 +125,7 @@ }; typedef indexer_node_base class_type; - class indexer_node_base_operation : public aggregated_operation { + class indexer_node_base_operation : public d1::aggregated_operation { public: char type; union { @@ -126,15 +133,23 @@ successor_type *my_succ; graph_task* bypass_t; }; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo const* metainfo; +#endif indexer_node_base_operation(const output_type* e, op_type t) : - type(char(t)), my_arg(e) {} + type(char(t)), my_arg(e) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + indexer_node_base_operation(const output_type* e, op_type t, const message_metainfo& info) + : type(char(t)), my_arg(e), metainfo(&info) {} +#endif indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(indexer_node_base_operation* op_list) { indexer_node_base_operation *current; @@ -153,7 +168,8 @@ current->status.store( SUCCEEDED, std::memory_order_release); break; case try__put_task: { - current->bypass_t = my_successors.try_put_task(*(current->my_arg)); + current->bypass_t = my_successors.try_put_task(*(current->my_arg) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(current->metainfo))); current->status.store( SUCCEEDED, std::memory_order_release); // return of try_put_task actual return value } break; @@ -186,8 +202,11 @@ return op_data.status == SUCCEEDED; } - graph_task* try_put_task(output_type const *v) { // not a virtual method in this class - indexer_node_base_operation op_data(v, try__put_task); + // not a virtual method in this class + graph_task* try_put_task(output_type const *v + 
__TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + indexer_node_base_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); return op_data.bypass_t; } diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h index 423033b1d5..cf7c54b852 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -37,8 +37,14 @@ class item_buffer { typedef T item_type; enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 }; protected: + struct aligned_space_item { + item_type item; + buffer_item_state state; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + }; typedef size_t size_type; - typedef std::pair aligned_space_item; typedef aligned_space buffer_item_type; typedef typename allocator_traits::template rebind_alloc allocator_type; buffer_item_type *my_array; @@ -49,45 +55,89 @@ class item_buffer { bool buffer_empty() const { return my_head == my_tail; } - aligned_space_item &item(size_type i) { - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + aligned_space_item &element(size_type i) { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + 
__TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); +#endif return *my_array[i & (my_array_size - 1) ].begin(); } - const aligned_space_item &item(size_type i) const { - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + const aligned_space_item &element(size_type i) const { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); +#endif return *my_array[i & (my_array_size-1)].begin(); } - bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); } + bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (element(i).state != no_item); } #if TBB_USE_ASSERT - bool my_item_reserved(size_type i) const { return item(i).second == reserved_item; } + bool my_item_reserved(size_type i) const { return element(i).state == reserved_item; } #endif // object management in buffer const item_type &get_my_item(size_t i) const { __TBB_ASSERT(my_item_valid(i),"attempt to get invalid item"); - item_type* itm = const_cast(reinterpret_cast(&item(i).first)); - return *itm; + return element(i).item; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo& get_my_metainfo(size_t i) { + __TBB_ASSERT(my_item_valid(i), "attempt to get invalid item"); + return element(i).metainfo; + } +#endif + // may be called with an empty slot or a slot that has already been constructed into. 
- void set_my_item(size_t i, const item_type &o) { - if(item(i).second != no_item) { + void set_my_item(size_t i, const item_type &o + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + if(element(i).state != no_item) { destroy_item(i); } - new(&(item(i).first)) item_type(o); - item(i).second = has_item; + new(&(element(i).item)) item_type(o); + element(i).state = has_item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + new(&element(i).metainfo) message_metainfo(metainfo); + + for (auto& waiter : metainfo.waiters()) { + waiter->reserve(1); + } +#endif + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + void set_my_item(size_t i, const item_type& o, message_metainfo&& metainfo) { + if(element(i).state != no_item) { + destroy_item(i); + } + + new(&(element(i).item)) item_type(o); + new(&element(i).metainfo) message_metainfo(std::move(metainfo)); + // Skipping the reservation on metainfo.waiters since the ownership + // is moving from metainfo to the cache + element(i).state = has_item; } +#endif // destructively-fetch an object from the buffer +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + void fetch_item(size_t i, item_type& o, message_metainfo& metainfo) { + __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); + o = get_my_item(i); // could have std::move assign semantics + metainfo = std::move(get_my_metainfo(i)); + destroy_item(i); + } +#else void fetch_item(size_t i, item_type &o) { __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); o = get_my_item(i); // could have std::move assign semantics destroy_item(i); } +#endif // move an existing item from one slot to another. The moved-to slot must be unoccupied, // the moved-from slot must exist and not be reserved. 
The after, from will be empty, @@ -95,12 +145,22 @@ class item_buffer { void move_item(size_t to, size_t from) { __TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot"); __TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot"); - set_my_item(to, get_my_item(from)); // could have std::move semantics + // could have std::move semantics + set_my_item(to, get_my_item(from) __TBB_FLOW_GRAPH_METAINFO_ARG(get_my_metainfo(from))); destroy_item(from); - } // put an item in an empty slot. Return true if successful, else false +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + bool place_item(size_t here, const item_type &me, Metainfo&& metainfo) { +#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES + if(my_item_valid(here)) return false; +#endif + set_my_item(here, me, std::forward(metainfo)); + return true; + } +#else bool place_item(size_t here, const item_type &me) { #if !TBB_DEPRECATED_SEQUENCER_DUPLICATES if(my_item_valid(here)) return false; @@ -108,19 +168,36 @@ class item_buffer { set_my_item(here, me); return true; } +#endif // could be implemented with std::move semantics void swap_items(size_t i, size_t j) { __TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)"); item_type temp = get_my_item(i); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo temp_metainfo = get_my_metainfo(i); + set_my_item(i, get_my_item(j), get_my_metainfo(j)); + set_my_item(j, temp, temp_metainfo); +#else set_my_item(i, get_my_item(j)); set_my_item(j, temp); +#endif } void destroy_item(size_type i) { __TBB_ASSERT(my_item_valid(i), "destruction of invalid item"); - item(i).first.~item_type(); - item(i).second = no_item; + + auto& e = element(i); + e.item.~item_type(); + e.state = no_item; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto& msg_waiter : e.metainfo.waiters()) { + msg_waiter->release(1); + } + + e.metainfo.~message_metainfo(); +#endif } // returns the front element @@ -130,6 +207,14 @@ class 
item_buffer { return get_my_item(my_head); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& front_metainfo() const + { + __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item"); + return element(my_head).metainfo; + } +#endif + // returns the back element const item_type& back() const { @@ -137,9 +222,23 @@ class item_buffer { return get_my_item(my_tail - 1); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& back_metainfo() const { + __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch head non-item"); + return element(my_tail - 1).metainfo; + } +#endif + // following methods are for reservation of the front of a buffer. - void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); item(i).second = reserved_item; } - void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); item(i).second = has_item; } + void reserve_item(size_type i) { + __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); + element(i).state = reserved_item; + } + + void release_item(size_type i) { + __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); + element(i).state = has_item; + } void destroy_front() { destroy_item(my_head); ++my_head; } void destroy_back() { destroy_item(my_tail-1); --my_tail; } @@ -163,14 +262,18 @@ class item_buffer { buffer_item_type* new_array = allocator_type().allocate(new_size); // initialize validity to "no" - for( size_type i=0; isecond = no_item; } + for( size_type i=0; istate = no_item; } for( size_type i=my_head; ifirst); + char *new_space = (char *)&(new_array[i&(new_size-1)].begin()->item); (void)new(new_space) item_type(get_my_item(i)); - new_array[i&(new_size-1)].begin()->second = item(i).second; + new_array[i&(new_size-1)].begin()->state = element(i).state; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + char* meta_space = (char 
*)&(new_array[i&(new_size-1)].begin()->metainfo); + ::new(meta_space) message_metainfo(std::move(element(i).metainfo)); +#endif } } @@ -180,33 +283,61 @@ class item_buffer { my_array_size = new_size; } - bool push_back(item_type &v) { - if(buffer_full()) { + bool push_back(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + if (buffer_full()) { grow_my_array(size() + 1); } - set_my_item(my_tail, v); + set_my_item(my_tail, v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++my_tail; return true; } - bool pop_back(item_type &v) { - if (!my_item_valid(my_tail-1)) { + bool pop_back(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) + { + if (!my_item_valid(my_tail - 1)) { return false; } - v = this->back(); + auto& e = element(my_tail - 1); + v = e.item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + metainfo = std::move(e.metainfo); +#endif + destroy_back(); return true; } - bool pop_front(item_type &v) { - if(!my_item_valid(my_head)) { + bool pop_front(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) + { + if (!my_item_valid(my_head)) { return false; } - v = this->front(); + auto& e = element(my_head); + v = e.item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + metainfo = std::move(e.metainfo); +#endif + destroy_front(); return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool pop_back(item_type& v) { + message_metainfo metainfo; + return pop_back(v, metainfo); + } + + bool pop_front(item_type& v) { + message_metainfo metainfo; + return pop_front(v, metainfo); + } +#endif + // This is used both for reset and for grow_my_array. In the case of grow_my_array // we want to retain the values of the head and tail. 
void clean_up_buffer(bool reset_pointers) { @@ -261,6 +392,18 @@ class reservable_item_buffer : public item_buffer { return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve_front(T& v, message_metainfo& metainfo) { + if (my_reserved || !my_item_valid(this->my_head)) return false; + my_reserved = true; + // reserving the head + v = this->front(); + metainfo = this->front_metainfo(); + this->reserve_item(this->my_head); + return true; + } +#endif + void consume_front() { __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item"); this->destroy_front(); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h index 5515421ede..8bca9a2c41 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. 
#endif -// included into namespace tbb::detail::d1 +// included into namespace tbb::detail::d2 struct forwarding_base : no_assign { forwarding_base(graph &g) : graph_ref(g) {} @@ -89,17 +89,49 @@ return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + if (!std::get(my_input).reserve(std::get(out), element_metainfo)) return false; + if (!join_helper::reserve(my_input, out, metainfo)) { + release_my_reservation(my_input); + return false; + } + metainfo.merge(element_metainfo); + return true; + + } +#endif + template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { bool res = std::get(my_input).get_item(std::get(out) ); // may fail return join_helper::get_my_item(my_input, out) && res; // do get on other inputs before returning } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + bool res = std::get(my_input).get_item(std::get(out), element_metainfo); + metainfo.merge(element_metainfo); + return join_helper::get_my_item(my_input, out, metainfo) && res; + } +#endif + template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return get_my_item(my_input, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + return get_my_item(my_input, out, metainfo); + } +#endif + template static inline void reset_my_port(InputTuple &my_input) { join_helper::reset_my_port(my_input); @@ -163,16 +195,43 @@ return std::get<0>( my_input ).reserve( std::get<0>( out ) ); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + 
message_metainfo element_metainfo; + bool result = std::get<0>(my_input).reserve(std::get<0>(out), element_metainfo); + metainfo.merge(element_metainfo); + return result; + } +#endif + template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { return std::get<0>(my_input).get_item(std::get<0>(out)); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + bool res = std::get<0>(my_input).get_item(std::get<0>(out), element_metainfo); + metainfo.merge(element_metainfo); + return res; + } +#endif + template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return get_my_item(my_input, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + return get_my_item(my_input, out, metainfo); + } +#endif + template static inline void reset_my_port(InputTuple &my_input) { std::get<0>(my_input).reset_port(); @@ -216,23 +275,31 @@ }; typedef reserving_port class_type; - class reserving_port_operation : public aggregated_operation { + class reserving_port_operation : public d1::aggregated_operation { public: char type; union { T *my_arg; predecessor_type *my_pred; }; - reserving_port_operation(const T& e, op_type t) : - type(char(t)), my_arg(const_cast(&e)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif + reserving_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : + type(char(t)), my_arg(const_cast(&e)) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + reserving_port_operation(const T& e, op_type t) + : type(char(t)), my_arg(const_cast(&e)), metainfo(nullptr) {} +#endif reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)), 
my_pred(const_cast(&s)) {} reserving_port_operation(op_type t) : type(char(t)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(reserving_port_operation* op_list) { reserving_port_operation *current; @@ -262,14 +329,26 @@ if ( reserved ) { current->status.store( FAILED, std::memory_order_release); } - else if ( my_predecessors.try_reserve( *(current->my_arg) ) ) { - reserved = true; - current->status.store( SUCCEEDED, std::memory_order_release); - } else { - if ( my_predecessors.empty() ) { - my_join->increment_port_count(); + else { + bool reserve_result = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + reserve_result = my_predecessors.try_reserve(*(current->my_arg), + *(current->metainfo)); + } else +#endif + { + reserve_result = my_predecessors.try_reserve(*(current->my_arg)); + } + if (reserve_result) { + reserved = true; + current->status.store( SUCCEEDED, std::memory_order_release); + } else { + if ( my_predecessors.empty() ) { + my_join->increment_port_count(); + } + current->status.store( FAILED, std::memory_order_release); } - current->status.store( FAILED, std::memory_order_release); } break; case rel_res: @@ -294,6 +373,10 @@ return nullptr; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T&, const message_metainfo&) override { return nullptr; } +#endif + graph& graph_reference() const override { return my_join->graph_ref; } @@ -333,6 +416,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve( T& v, message_metainfo& metainfo ) { + reserving_port_operation op_data(v, res_item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + //! 
Release the port void release( ) { reserving_port_operation op_data(rel_res); @@ -376,31 +467,42 @@ enum op_type { get__item, res_port, try__put_task }; - class queueing_port_operation : public aggregated_operation { + class queueing_port_operation : public d1::aggregated_operation { public: char type; T my_val; T* my_arg; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif // constructor for value parameter - queueing_port_operation(const T& e, op_type t) : - type(char(t)), my_val(e), my_arg(nullptr) + queueing_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) + : type(char(t)), my_val(e), my_arg(nullptr) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) {} // constructor for pointer parameter - queueing_port_operation(const T* p, op_type t) : + queueing_port_operation(const T* p, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : type(char(t)), my_arg(const_cast(p)) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + queueing_port_operation(const T* p, op_type t) + : type(char(t)), my_arg(const_cast(p)), bypass_t(nullptr), metainfo(nullptr) {} +#endif // constructor with no parameter queueing_port_operation(op_type t) : type(char(t)), my_arg(nullptr) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(queueing_port_operation* op_list) { queueing_port_operation *current; @@ -412,7 +514,12 @@ case try__put_task: { graph_task* rtask = nullptr; was_empty = this->buffer_empty(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + 
this->push_back(current->my_val, *(current->metainfo)); +#else this->push_back(current->my_val); +#endif if (was_empty) rtask = my_join->decrement_port_count(false); else rtask = SUCCESSFULLY_ENQUEUED; @@ -424,6 +531,11 @@ if(!this->buffer_empty()) { __TBB_ASSERT(current->my_arg, nullptr); *(current->my_arg) = this->front(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + *(current->metainfo) = this->front_metainfo(); + } +#endif current->status.store( SUCCEEDED, std::memory_order_release); } else { @@ -447,14 +559,27 @@ template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; - graph_task* try_put_task(const T &v) override { - queueing_port_operation op_data(v, try__put_task); + + private: + graph_task* try_put_task_impl(const T& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + queueing_port_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); __TBB_ASSERT(op_data.status == SUCCEEDED || !op_data.bypass_t, "inconsistent return from aggregator"); if(!op_data.bypass_t) return SUCCESSFULLY_ENQUEUED; return op_data.bypass_t; } + protected: + graph_task* try_put_task(const T &v) override { + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + return try_put_task_impl(v, metainfo); + } +#endif + graph& graph_reference() const override { return my_join->graph_ref; } @@ -481,6 +606,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( T& v, message_metainfo& metainfo ) { + queueing_port_operation op_data(&v, get__item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + // reset_port is called when item is accepted by 
successor, but // is initiated by join_node. void reset_port() { @@ -517,13 +650,23 @@ const K& operator()(const table_item_type& v) { return v.my_key; } }; + template + struct key_matching_port_base { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + using type = metainfo_hash_buffer; +#else + using type = hash_buffer; +#endif + }; + // the ports can have only one template parameter. We wrap the types needed in // a traits type template< class TraitsType > class key_matching_port : public receiver, - public hash_buffer< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, - typename TraitsType::KHash > { + public key_matching_port_base< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, + typename TraitsType::KHash >::type + { public: typedef TraitsType traits; typedef key_matching_port class_type; @@ -533,7 +676,7 @@ typedef typename receiver::predecessor_type predecessor_type; typedef typename TraitsType::TtoK type_to_key_func_type; typedef typename TraitsType::KHash hash_compare_type; - typedef hash_buffer< key_type, input_type, type_to_key_func_type, hash_compare_type > buffer_type; + typedef typename key_matching_port_base::type buffer_type; private: // ----------- Aggregator ------------ @@ -541,24 +684,33 @@ enum op_type { try__put, get__item, res_port }; - class key_matching_port_operation : public aggregated_operation { + class key_matching_port_operation : public d1::aggregated_operation { public: char type; input_type my_val; input_type *my_arg; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo = nullptr; +#endif // constructor for value parameter - key_matching_port_operation(const input_type& e, op_type t) : - type(char(t)), my_val(e), my_arg(nullptr) {} + key_matching_port_operation(const input_type& e, op_type t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) + : type(char(t)), my_val(e), my_arg(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) 
{} + // constructor for pointer parameter - key_matching_port_operation(const input_type* p, op_type t) : - type(char(t)), my_arg(const_cast(p)) {} + key_matching_port_operation(const input_type* p, op_type t + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) + : type(char(t)), my_arg(const_cast(p)) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} + // constructor with no parameter key_matching_port_operation(op_type t) : type(char(t)), my_arg(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(key_matching_port_operation* op_list) { key_matching_port_operation *current; @@ -567,18 +719,35 @@ op_list = op_list->next; switch(current->type) { case try__put: { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + bool was_inserted = this->insert_with_key(current->my_val, *(current->metainfo)); +#else bool was_inserted = this->insert_with_key(current->my_val); +#endif // return failure if a duplicate insertion occurs current->status.store( was_inserted ? 
SUCCEEDED : FAILED, std::memory_order_release); } break; - case get__item: + case get__item: { // use current_key from FE for item __TBB_ASSERT(current->my_arg, nullptr); - if(!this->find_with_key(my_join->current_key, *(current->my_arg))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg), + *(current->metainfo)); +#else + bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg)); +#endif +#if TBB_USE_DEBUG + if (!find_result) { __TBB_ASSERT(false, "Failed to find item corresponding to current_key."); } +#else + tbb::detail::suppress_unused_warning(find_result); +#endif current->status.store( SUCCEEDED, std::memory_order_release); + } break; case res_port: // use current_key from FE for item @@ -593,17 +762,28 @@ template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; - graph_task* try_put_task(const input_type& v) override { - key_matching_port_operation op_data(v, try__put); + private: + graph_task* try_put_task_impl(const input_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + key_matching_port_operation op_data(v, try__put __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); graph_task* rtask = nullptr; my_aggregator.execute(&op_data); if(op_data.status == SUCCEEDED) { - rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn + rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn // rtask has to reflect the return status of the try_put if(!rtask) rtask = SUCCESSFULLY_ENQUEUED; } return rtask; } + protected: + graph_task* try_put_task(const input_type& v) override { + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const input_type& v, const 
message_metainfo& metainfo) override { + return try_put_task_impl(v, metainfo); + } +#endif graph& graph_reference() const override { return my_join->graph_ref; @@ -640,6 +820,15 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( input_type& v, message_metainfo& metainfo ) { + // aggregator uses current_key from FE for Key + key_matching_port_operation op_data(&v, get__item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + // reset_port is called when item is accepted by successor, but // is initiated by join_node. void reset_port() { @@ -695,10 +884,9 @@ graph_task* decrement_port_count() override { if(ports_with_no_inputs.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); - graph_ref.reserve_wait(); spawn_in_graph_arena(this->graph_ref, *t); } } @@ -726,6 +914,13 @@ return join_helper::reserve(my_inputs, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { + if (ports_with_no_inputs) return false; + return join_helper::reserve(my_inputs, out, metainfo); + } +#endif + void tuple_accepted() { join_helper::consume_reservations(my_inputs); } @@ -768,10 +963,9 @@ { if(ports_with_no_items.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); - graph_ref.reserve_wait(); if( !handle_task ) return t; spawn_in_graph_arena(this->graph_ref, *t); @@ -800,6 +994,13 @@ return join_helper::get_items(my_inputs, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, 
message_metainfo& metainfo) { + if(ports_with_no_items) return false; + return join_helper::get_items(my_inputs, out, metainfo); + } +#endif + void tuple_accepted() { reset_port_count(); join_helper::reset_ports(my_inputs); @@ -854,23 +1055,30 @@ enum op_type { res_count, inc_count, may_succeed, try_make }; typedef join_node_FE, InputTuple, OutputTuple> class_type; - class key_matching_FE_operation : public aggregated_operation { + class key_matching_FE_operation : public d1::aggregated_operation { public: char type; unref_key_type my_val; output_type* my_output; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo = nullptr; +#endif // constructor for value parameter key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e), my_output(nullptr), bypass_t(nullptr) {} key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + key_matching_FE_operation(output_type *p, op_type t, message_metainfo& info) + : type(char(t)), my_output(p), bypass_t(nullptr), metainfo(&info) {} +#endif // constructor with no parameter key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; // called from aggregator, so serialized // returns a task pointer if the a task would have been enqueued but we asked that @@ -881,13 +1089,15 @@ bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref); this->current_key = t; this->delete_with_key(this->current_key); // remove the key - if(join_helper::get_items(my_inputs, l_out)) { // <== call back - this->push_back(l_out); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; 
+#endif + if(join_helper::get_items(my_inputs, l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { // <== call back + this->push_back(l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(do_fwd) { // we enqueue if receiving an item from predecessor, not if successor asks for item - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; rtask = allocator.new_object(this->graph_ref, allocator, *my_node); - this->graph_ref.reserve_wait(); do_fwd = false; } // retire the input values @@ -937,6 +1147,11 @@ } else { *(current->my_output) = this->front(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + *(current->metainfo) = this->front_metainfo(); + } +#endif current->status.store( SUCCEEDED, std::memory_order_release); } break; @@ -1010,6 +1225,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { + key_matching_FE_operation op_data(&out, try_make, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + void tuple_accepted() { reset_port_count(); // reset current_key after ports reset. 
} @@ -1044,7 +1267,7 @@ }; typedef join_node_base class_type; - class join_node_base_operation : public aggregated_operation { + class join_node_base_operation : public d1::aggregated_operation { public: char type; union { @@ -1052,17 +1275,25 @@ successor_type *my_succ; }; graph_task* bypass_t; - join_node_base_operation(const output_type& e, op_type t) : type(char(t)), - my_arg(const_cast(&e)), bypass_t(nullptr) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif + join_node_base_operation(const output_type& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) + : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + join_node_base_operation(const output_type& e, op_type t) + : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr), metainfo(nullptr) {} +#endif join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)), bypass_t(nullptr) {} join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; bool forwarder_busy; - aggregator my_aggregator; + d1::aggregator my_aggregator; void handle_operations(join_node_base_operation* op_list) { join_node_base_operation *current; @@ -1073,10 +1304,9 @@ case reg_succ: { my_successors.register_successor(*(current->my_succ)); if(tuple_build_may_succeed() && !forwarder_busy && is_graph_active(my_graph)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass< join_node_base > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(my_graph, *t); forwarder_busy = true; } @@ -1089,7 +1319,26 @@ break; case try__get: 
if(tuple_build_may_succeed()) { - if(try_to_make_tuple(*(current->my_arg))) { + bool make_tuple_result = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + make_tuple_result = try_to_make_tuple(*(current->my_arg), *(current->metainfo)); + } else +#endif + { + make_tuple_result = try_to_make_tuple(*(current->my_arg)); + } + if(make_tuple_result) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + // Since elements would be removed from queues while calling to tuple_accepted + // together with corresponding message_metainfo objects + // we need to prolong the wait until the successor would create a task for removed elements + for (auto waiter : current->metainfo->waiters()) { + waiter->reserve(1); + } + } +#endif tuple_accepted(); current->status.store( SUCCEEDED, std::memory_order_release); } @@ -1110,9 +1359,14 @@ // them from the input ports after forwarding is complete? if(tuple_build_may_succeed()) { // checks output queue of FE do { - build_succeeded = try_to_make_tuple(out); // fetch front_end of queue +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + // fetch front_end of queue + build_succeeded = try_to_make_tuple(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(build_succeeded) { - graph_task *new_task = my_successors.try_put_task(out); + graph_task *new_task = + my_successors.try_put_task(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); last_task = combine_tasks(my_graph, last_task, new_task); if(new_task) { tuple_accepted(); @@ -1175,6 +1429,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( output_type &v, message_metainfo& metainfo) override { + join_node_base_operation op_data(v, try__get, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + protected: void reset_node(reset_flags f) override { input_ports_type::reset(f); diff --git 
a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h index b79c53ddbf..336cb069c6 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,6 +34,12 @@ class function_input_queue : public item_buffer { return this->item_buffer::front(); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& front_metainfo() const { + return this->item_buffer::front_metainfo(); + } +#endif + void pop() { this->destroy_front(); } @@ -41,6 +47,12 @@ class function_input_queue : public item_buffer { bool push( T& t ) { return this->push_back( t ); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool push( T& t, const message_metainfo& metainfo ) { + return this->push_back(t, metainfo); + } +#endif }; //! Input and scheduling for a function node that takes a type Input as input @@ -87,11 +99,14 @@ class function_input_base : public receiver, no_assign { } graph_task* try_put_task( const input_type& t) override { - if ( my_is_no_throw ) - return try_put_task_impl(t, has_policy()); - else - return try_put_task_impl(t, std::false_type()); + return try_put_task_base(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const input_type& t, const message_metainfo& metainfo ) override { + return try_put_task_base(t, metainfo); } +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Adds src to the list of cached predecessors. 
bool register_predecessor( predecessor_type &src ) override { @@ -148,9 +163,12 @@ class function_input_base : public receiver, no_assign { private: friend class apply_body_task_bypass< class_type, input_type >; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + friend class apply_body_task_bypass< class_type, input_type, trackable_messages_graph_task >; +#endif friend class forward_task_bypass< class_type >; - class operation_type : public aggregated_operation< operation_type > { + class operation_type : public d1::aggregated_operation< operation_type > { public: char type; union { @@ -158,31 +176,49 @@ class function_input_base : public receiver, no_assign { predecessor_type *r; }; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif operation_type(const input_type& e, op_type t) : - type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) {} + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + , metainfo(nullptr) +#endif + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + operation_type(const input_type& e, op_type t, const message_metainfo& info) : + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr), + metainfo(const_cast(&info)) {} +#endif operation_type(op_type t) : type(char(t)), r(nullptr), bypass_t(nullptr) {} }; bool forwarder_busy; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator< handler_type, operation_type > my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator< handler_type, operation_type > my_aggregator; graph_task* perform_queued_requests() { graph_task* new_task = nullptr; if(my_queue) { if(!my_queue->empty()) { ++my_concurrency; - new_task = create_body_task(my_queue->front()); + // TODO: consider removing metainfo from the queue using move semantics to avoid + // ref counter increase + new_task = 
create_body_task(my_queue->front() + __TBB_FLOW_GRAPH_METAINFO_ARG(my_queue->front_metainfo())); my_queue->pop(); } } else { input_type i; - if(my_predecessors.get_item(i)) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + if(my_predecessors.get_item(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { ++my_concurrency; - new_task = create_body_task(i); + new_task = create_body_task(i __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } return new_task; @@ -233,10 +269,13 @@ class function_input_base : public receiver, no_assign { __TBB_ASSERT(my_max_concurrency != 0, nullptr); if (my_concurrency < my_max_concurrency) { ++my_concurrency; - graph_task * new_task = create_body_task(*(op->elem)); + graph_task* new_task = create_body_task(*(op->elem) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))); op->bypass_t = new_task; op->status.store(SUCCEEDED, std::memory_order_release); - } else if ( my_queue && my_queue->push(*(op->elem)) ) { + } else if ( my_queue && my_queue->push(*(op->elem) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))) ) + { op->bypass_t = SUCCESSFULLY_ENQUEUED; op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -258,8 +297,10 @@ class function_input_base : public receiver, no_assign { } } - graph_task* internal_try_put_bypass( const input_type& t ) { - operation_type op_data(t, tryput_bypass); + graph_task* internal_try_put_bypass( const input_type& t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + operation_type op_data(t, tryput_bypass __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); if( op_data.status == SUCCEEDED ) { return op_data.bypass_t; @@ -267,43 +308,75 @@ class function_input_base : public receiver, no_assign { return nullptr; } - graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type ) { + graph_task* try_put_task_base(const input_type& t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { 
+ if ( my_is_no_throw ) + return try_put_task_impl(t, has_policy() + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + else + return try_put_task_impl(t, std::false_type() + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + } + + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { if( my_max_concurrency == 0 ) { - return apply_body_bypass(t); + return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { operation_type check_op(t, occupy_concurrency); my_aggregator.execute(&check_op); if( check_op.status == SUCCEEDED ) { - return apply_body_bypass(t); + return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } - return internal_try_put_bypass(t); + return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } - graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type ) { + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { if( my_max_concurrency == 0 ) { - return create_body_task(t); + return create_body_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { - return internal_try_put_bypass(t); + return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } //! Applies the body to the provided input // then decides if more work is available - graph_task* apply_body_bypass( const input_type &i ) { - return static_cast(this)->apply_body_impl_bypass(i); + graph_task* apply_body_bypass( const input_type &i + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + + { + return static_cast(this)->apply_body_impl_bypass(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } //! 
allocates a task to apply a body - graph_task* create_body_task( const input_type &input ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + graph_task* create_body_task( const input_type &input, Metainfo&& metainfo ) +#else + graph_task* create_body_task( const input_type &input ) +#endif + { if (!is_graph_active(my_graph_ref)) { return nullptr; } // TODO revamp: extract helper for common graph task allocation part - small_object_allocator allocator{}; - typedef apply_body_task_bypass task_type; - graph_task* t = allocator.new_object( my_graph_ref, allocator, *this, input, my_priority ); - graph_reference().reserve_wait(); + d1::small_object_allocator allocator{}; + graph_task* t = nullptr; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (!metainfo.empty()) { + using task_type = apply_body_task_bypass; + t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority, std::forward(metainfo)); + } else +#endif + { + using task_type = apply_body_task_bypass; + t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority); + } return t; } @@ -327,10 +400,9 @@ class function_input_base : public receiver, no_assign { if (!is_graph_active(my_graph_ref)) { return nullptr; } - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object( graph_reference(), allocator, *this, my_priority ); - graph_reference().reserve_wait(); return t; } @@ -398,7 +470,9 @@ class function_input : public function_input_base::emit_this(g,t,p); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static graph_task* emit_this(graph& g, const TupleType& t, PortsType& p, + const message_metainfo& metainfo) + { + // TODO: consider to collect all the tasks in task_list and spawn them all at once + graph_task* last_task = std::get(p).try_put_task(std::get(t), metainfo); + check_task_and_spawn(g, last_task); + return emit_element::emit_this(g, t, p, 
metainfo); + } +#endif }; template<> @@ -588,6 +676,17 @@ struct emit_element<1> { check_task_and_spawn(g, last_task); return SUCCESSFULLY_ENQUEUED; } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static graph_task* emit_this(graph& g, const TupleType& t, PortsType& ports, + const message_metainfo& metainfo) + { + graph_task* last_task = std::get<0>(ports).try_put_task(std::get<0>(t), metainfo); + check_task_and_spawn(g, last_task); + return SUCCESSFULLY_ENQUEUED; + } +#endif }; //! Implements methods for an executable node that takes continue_msg as input @@ -654,18 +753,25 @@ class continue_input : public continue_receiver { virtual broadcast_cache &successors() = 0; friend class apply_body_task_bypass< class_type, continue_msg >; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + friend class apply_body_task_bypass< class_type, continue_msg, trackable_messages_graph_task >; +#endif //! Applies the body to the provided input - graph_task* apply_body_bypass( input_type ) { + graph_task* apply_body_bypass( input_type __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { // There is an extra copied needed to capture the // body execution without the try_put fgt_begin_body( my_body ); output_type v = (*my_body)( continue_msg() ); fgt_end_body( my_body ); - return successors().try_put_task( v ); + return successors().try_put_task( v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* execute(const message_metainfo& metainfo) override { +#else graph_task* execute() override { +#endif if(!is_graph_active(my_graph_ref)) { return nullptr; } @@ -677,13 +783,21 @@ class continue_input : public continue_receiver { #if _MSC_VER && !__INTEL_COMPILER #pragma warning (pop) #endif - return apply_body_bypass( continue_msg() ); + return apply_body_bypass( continue_msg() __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } else { - small_object_allocator allocator{}; - typedef apply_body_task_bypass task_type; 
- graph_task* t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); - graph_reference().reserve_wait(); + d1::small_object_allocator allocator{}; + graph_task* t = nullptr; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (!metainfo.empty()) { + using task_type = apply_body_task_bypass; + t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority, metainfo ); + } else +#endif + { + using task_type = apply_body_task_bypass; + t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); + } return t; } } @@ -755,6 +869,12 @@ class multifunction_output : public function_output { return my_successors.try_put_task(i); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const output_type& i, const message_metainfo& metainfo) { + return my_successors.try_put_task(i, metainfo); + } +#endif + template friend struct emit_element; }; // multifunction_output diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h index ce867121f9..8440bd7008 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. 
#endif -// Included in namespace tbb::detail::d1 (in flow_graph.h) +// Included in namespace tbb::detail::d2 (in flow_graph.h) #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET // Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h index 8c20993795..47ecfb2a84 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct declare_body_types { @@ -51,10 +51,10 @@ template struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; @@ -63,7 +63,7 @@ template struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template using input_t = typename body_types::input_type; @@ -100,7 +100,7 @@ decltype(decide_on_operator_overload(std::declval())) decide_on_callable_t template input_node(GraphOrSet&&, Body) ->input_node(0))>>; - + #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template @@ -268,7 +268,7 @@ template write_once_node(const NodeSet&) ->write_once_node>; #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git 
a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h index 0d9de17654..0f7c0d174f 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,32 +30,88 @@ // elements in the table are a simple list; we need pointer to next element to // traverse the chain -template -struct buffer_element_type { - // the second parameter below is void * because we can't forward-declare the type - // itself, so we just reinterpret_cast below. - typedef typename aligned_pair::type type; + +template +struct hash_buffer_element : public aligned_pair { + using key_type = Key; + using value_type = ValueType; + + value_type* get_value_ptr() { return reinterpret_cast(this->first); } + hash_buffer_element* get_next() { return reinterpret_cast(this->second); } + void set_next(hash_buffer_element* new_next) { this->second = reinterpret_cast(new_next); } + + void create_element(const value_type& v) { + ::new(this->first) value_type(v); + } + + void create_element(hash_buffer_element&& other) { + ::new(this->first) value_type(std::move(*other.get_value_ptr())); + } + + void destroy_element() { + get_value_ptr()->~value_type(); + } +}; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template +struct metainfo_hash_buffer_element : public aligned_triple { + using key_type = Key; + using value_type = ValueType; + + value_type* get_value_ptr() { return reinterpret_cast(this->first); } + metainfo_hash_buffer_element* get_next() { + return reinterpret_cast(this->second); + } + void set_next(metainfo_hash_buffer_element* new_next) { this->second = 
reinterpret_cast(new_next); } + message_metainfo& get_metainfo() { return this->third; } + + void create_element(const value_type& v, const message_metainfo& metainfo) { + __TBB_ASSERT(this->third.empty(), nullptr); + ::new(this->first) value_type(v); + this->third = metainfo; + + for (auto waiter : metainfo.waiters()) { + waiter->reserve(1); + } + } + + void create_element(metainfo_hash_buffer_element&& other) { + __TBB_ASSERT(this->third.empty(), nullptr); + ::new(this->first) value_type(std::move(*other.get_value_ptr())); + this->third = std::move(other.get_metainfo()); + } + + void destroy_element() { + get_value_ptr()->~value_type(); + + for (auto waiter : get_metainfo().waiters()) { + waiter->release(1); + } + get_metainfo() = message_metainfo{}; + } }; +#endif template < - typename Key, // type of key within ValueType - typename ValueType, + typename ElementType, typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType typename HashCompare, // has hash and equal - typename Allocator=tbb::cache_aligned_allocator< typename aligned_pair::type > + typename Allocator=tbb::cache_aligned_allocator > -class hash_buffer : public HashCompare { +class hash_buffer_impl : public HashCompare { public: static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table - typedef ValueType value_type; - typedef typename buffer_element_type< value_type >::type element_type; + typedef typename ElementType::key_type key_type; + typedef typename ElementType::value_type value_type; + typedef ElementType element_type; typedef value_type *pointer_type; typedef element_type *list_array_type; // array we manage manually typedef list_array_type *pointer_array_type; typedef typename std::allocator_traits::template rebind_alloc pointer_array_allocator_type; typedef typename std::allocator_traits::template rebind_alloc elements_array_allocator; - typedef typename std::decay::type Knoref; + typedef typename std::decay::type Knoref; 
private: ValueToKey *my_key; @@ -69,9 +125,9 @@ class hash_buffer : public HashCompare { void set_up_free_list( element_type **p_free_list, list_array_type la, size_t sz) { for(size_t i=0; i < sz - 1; ++i ) { // construct free list - la[i].second = &(la[i+1]); + la[i].set_next(&(la[i + 1])); } - la[sz-1].second = nullptr; + la[sz - 1].set_next(nullptr); *p_free_list = (element_type *)&(la[0]); } @@ -101,15 +157,18 @@ class hash_buffer : public HashCompare { { DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size); new_elements_array = elements_array_allocator().allocate(my_size); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < my_size; ++i) { + ::new(new_elements_array + i) element_type(); + } +#endif new_pointer_array = pointer_array_allocator_type().allocate(new_size); for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = nullptr; set_up_free_list(&new_free_list, new_elements_array, my_size ); for(size_t i=0; i < my_size; ++i) { - for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->second)) { - value_type *ov = reinterpret_cast(&(op->first)); - // could have std::move semantics - internal_insert_with_key(new_pointer_array, new_size, new_free_list, *ov); + for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->get_next())) { + internal_insert_with_key(new_pointer_array, new_size, new_free_list, std::move(*op)); } } my_cleanup.my_pa = nullptr; @@ -126,15 +185,26 @@ class hash_buffer : public HashCompare { // v should have perfect forwarding if std::move implemented. // we use this method to move elements in grow_array, so can't use class fields + template + const value_type& get_value_from_pack(const Value& value, const Args&...) 
{ + return value; + } + + template + const value_type& get_value_from_pack(Element&& element) { + return *(element.get_value_ptr()); + } + + template void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list, - const value_type &v) { + Args&&... args) { size_t l_mask = p_sz-1; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - size_t h = this->hash(tbb::detail::invoke(*my_key, v)) & l_mask; + size_t h = this->hash(tbb::detail::invoke(*my_key, get_value_from_pack(args...))) & l_mask; __TBB_ASSERT(p_free_list, "Error: free list not set up."); - element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second); - (void) new(&(my_elem->first)) value_type(v); - my_elem->second = p_pointer_array[h]; + element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->get_next()); + my_elem->create_element(std::forward(args)...); + my_elem->set_next(p_pointer_array[h]); p_pointer_array[h] = my_elem; } @@ -142,6 +212,11 @@ class hash_buffer : public HashCompare { pointer_array = pointer_array_allocator_type().allocate(my_size); for(size_t i = 0; i < my_size; ++i) pointer_array[i] = nullptr; elements_array = elements_array_allocator().allocate(my_size / 2); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < my_size / 2; ++i) { + ::new(elements_array + i) element_type(); + } +#endif set_up_free_list(&free_list, elements_array, my_size / 2); } @@ -151,13 +226,8 @@ class hash_buffer : public HashCompare { for(size_t i = 0; i < sz; ++i ) { element_type *p_next; for( element_type *p = pa[i]; p; p = p_next) { - p_next = (element_type *)p->second; - // TODO revamp: make sure type casting is correct. 
- void* ptr = (void*)(p->first); -#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER - suppress_unused_warning(ptr); -#endif - ((value_type*)ptr)->~value_type(); + p_next = p->get_next(); + p->destroy_element(); } } pointer_array_allocator_type().deallocate(pa, sz); @@ -166,6 +236,11 @@ class hash_buffer : public HashCompare { // Separate test (if allocation of pa throws, el may be allocated. // but no elements will be constructed.) if(el) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < sz / 2; ++i) { + (el + i)->~element_type(); + } +#endif elements_array_allocator().deallocate(el, sz / 2); el = nullptr; } @@ -174,17 +249,17 @@ class hash_buffer : public HashCompare { } public: - hash_buffer() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { + hash_buffer_impl() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { internal_initialize_buffer(); } - ~hash_buffer() { + ~hash_buffer_impl() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); delete my_key; my_key = nullptr; } - hash_buffer(const hash_buffer&) = delete; - hash_buffer& operator=(const hash_buffer&) = delete; + hash_buffer_impl(const hash_buffer_impl&) = delete; + hash_buffer_impl& operator=(const hash_buffer_impl&) = delete; void reset() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); @@ -197,34 +272,41 @@ class hash_buffer : public HashCompare { // pointer is used to clone() ValueToKey* get_key_func() { return my_key; } - bool insert_with_key(const value_type &v) { - pointer_type p = nullptr; + template + bool insert_with_key(const value_type &v, Args&&... 
args) { + element_type* p = nullptr; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(find_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { - p->~value_type(); - (void) new(p) value_type(v); // copy-construct into the space + if(find_element_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { + p->destroy_element(); + p->create_element(v, std::forward(args)...); return false; } ++nelements; if(nelements*2 > my_size) grow_array(); - internal_insert_with_key(pointer_array, my_size, free_list, v); + internal_insert_with_key(pointer_array, my_size, free_list, v, std::forward(args)...); return true; } - // returns true and sets v to array element if found, else returns false. - bool find_ref_with_key(const Knoref& k, pointer_type &v) { + bool find_element_ref_with_key(const Knoref& k, element_type*& v) { size_t i = this->hash(k) & mask(); - for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) { - pointer_type pv = reinterpret_cast(&(p->first)); + for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->get_next())) { __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(this->equal(tbb::detail::invoke(*my_key, *pv), k)) { - v = pv; + if(this->equal(tbb::detail::invoke(*my_key, *p->get_value_ptr()), k)) { + v = p; return true; } } return false; } + // returns true and sets v to array element if found, else returns false. 
+ bool find_ref_with_key(const Knoref& k, pointer_type &v) { + element_type* element_ptr = nullptr; + bool res = find_element_ref_with_key(k, element_ptr); + v = element_ptr->get_value_ptr(); + return res; + } + bool find_with_key( const Knoref& k, value_type &v) { value_type *p; if(find_ref_with_key(k, p)) { @@ -238,14 +320,14 @@ class hash_buffer : public HashCompare { void delete_with_key(const Knoref& k) { size_t h = this->hash(k) & mask(); element_type* prev = nullptr; - for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) { - value_type *vp = reinterpret_cast(&(p->first)); + for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->get_next())) { + value_type *vp = p->get_value_ptr(); __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) { - vp->~value_type(); - if(prev) prev->second = p->second; - else pointer_array[h] = (element_type *)(p->second); - p->second = free_list; + p->destroy_element(); + if(prev) prev->set_next(p->get_next()); + else pointer_array[h] = (element_type *)(p->get_next()); + p->set_next(free_list); free_list = p; --nelements; return; @@ -254,4 +336,45 @@ class hash_buffer : public HashCompare { __TBB_ASSERT(false, "key not found for delete"); } }; + +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename Allocator=tbb::cache_aligned_allocator> + > +using hash_buffer = hash_buffer_impl, + ValueToKey, HashCompare, Allocator>; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename 
Allocator=tbb::cache_aligned_allocator> + > +struct metainfo_hash_buffer : public hash_buffer_impl, + ValueToKey, HashCompare, Allocator> +{ +private: + using base_type = hash_buffer_impl, + ValueToKey, HashCompare, Allocator>; +public: + bool find_with_key(const typename base_type::Knoref& k, + typename base_type::value_type& v, message_metainfo& metainfo) + { + typename base_type::element_type* p = nullptr; + bool result = this->find_element_ref_with_key(k, p); + if (result) { + v = *(p->get_value_ptr()); + metainfo = p->get_metainfo(); + } + return result; + } +}; +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT #endif // __TBB__flow_graph_hash_buffer_impl_H diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h index a161dd0362..74ebf08456 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template< typename T > class sender; template< typename T > class receiver; @@ -44,29 +44,29 @@ template< typename T > class receiver; static inline void fgt_alias_port(void *node, void *p, bool visible) { if(visible) - itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); else - itt_relation_add( ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); } static inline void fgt_composite ( void* codeptr, void *node, void *graph ) { - itt_make_task_group( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); - itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); } static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); - itt_relation_add( 
ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); } template @@ -109,15 +109,15 @@ struct fgt_internal_output_alias_helper { }; static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); } static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) { - itt_make_task_group(ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); + itt_make_task_group(d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } @@ -167,40 +167,40 @@ struct fgt_internal_output_helper { template< typename NodeType > void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) ); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } template< typename NodeType > void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = const_cast(node); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, 
FLOW_OBJECT_NAME, desc ); } template< typename NodeType > static inline void fgt_node_desc( const NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) ); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } static inline void fgt_graph_desc( const void *g, const char *desc ) { void *addr = const_cast< void *>(g); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); } static inline void fgt_body( void *node, void *body ) { - itt_relation_add( ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) { - itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port(codeptr, input_port, ports ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) { - itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port( codeptr, input_port, ports ); fgt_body( input_port, body 
); @@ -208,28 +208,28 @@ static inline void fgt_multioutput_node_with_body( void* codeptr, string_resourc template< int N, typename PortsTuple > static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_internal_input_helper::register_port( output_port, ports ); } static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) { - itt_make_task_group( ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); } static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_body( output_port, body ); } @@ -251,47 +251,47 @@ static inline void fgt_node( void* codeptr, string_resource_index t, void *g, 
v } static inline void fgt_make_edge( void *output_port, void *input_port ) { - itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); + itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); } static inline void fgt_remove_edge( void *output_port, void *input_port ) { - itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); + itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); } static inline void fgt_graph( void *g ) { - itt_make_task_group( ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); } static inline void fgt_begin_body( void *body ) { - itt_task_begin( ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); + itt_task_begin( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); } static inline void fgt_end_body( void * ) { - itt_task_end( ITT_DOMAIN_FLOW ); + itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_try_put_begin( void *node, void *port ) { - itt_task_begin( ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); + itt_task_begin( d1::ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); } static inline void fgt_async_try_put_end( void *, void * ) { - itt_task_end( ITT_DOMAIN_FLOW ); + itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_reserve( void *node, void *graph ) { - itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); + itt_region_begin( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); } static inline void fgt_async_commit( void *node, void * /*graph*/) { - itt_region_end( 
ITT_DOMAIN_FLOW, node, FLOW_NODE ); + itt_region_end( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE ); } static inline void fgt_reserve_wait( void *graph ) { - itt_region_begin( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); + itt_region_begin( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); } static inline void fgt_release_wait( void *graph ) { - itt_region_end( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); + itt_region_end( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); } #else // TBB_USE_PROFILING_TOOLS @@ -357,7 +357,7 @@ struct fgt_internal_output_alias_helper { #endif // TBB_USE_PROFILING_TOOLS -} // d1 +} // d2 } // namespace detail } // namespace tbb diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h index 4827551d85..e361b23e7b 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 +// included in namespace tbb::detail::d2 // the change to key_matching (adding a K and KHash template parameter, making it a class) // means we have to pass this data to the key_matching_port. All the ports have only one @@ -73,40 +73,55 @@ struct make_sequence < 0, S... > { typedef sequence type; }; -//! 
type mimicking std::pair but with trailing fill to ensure each element of an array -//* will have the correct alignment -template -struct type_plus_align { - char first[sizeof(T1)]; - T2 second; - char fill1[REM]; +template struct alignment_of { + typedef struct { char t; U padded; } test_alignment; + static const size_t value = sizeof(test_alignment) - sizeof(U); }; -template -struct type_plus_align { - char first[sizeof(T1)]; - T2 second; +template +struct max_alignment_helper; + +template +struct max_alignment_helper { + using type = typename max_alignment_helper::type>::type; }; -template struct alignment_of { - typedef struct { char t; U padded; } test_alignment; - static const size_t value = sizeof(test_alignment) - sizeof(U); +template +struct max_alignment_helper { + using type = typename std::conditional::type; }; +template +using max_alignment_helper_t = typename max_alignment_helper::type; + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4324) // warning C4324: structure was padded due to alignment specifier +#endif + // T1, T2 are actual types stored. The space defined for T1 in the type returned // is a char array of the correct size. Type T2 should be trivially-constructible, // T1 must be explicitly managed. -template -struct aligned_pair { - static const size_t t1_align = alignment_of::value; - static const size_t t2_align = alignment_of::value; - typedef type_plus_align just_pair; - static const size_t max_align = t1_align < t2_align ? t2_align : t1_align; - static const size_t extra_bytes = sizeof(just_pair) % max_align; - static const size_t remainder = extra_bytes ? 
max_align - extra_bytes : 0; -public: - typedef type_plus_align type; -}; // aligned_pair + +template +struct alignas(alignof(max_alignment_helper_t)) aligned_pair { + char first[sizeof(T1)]; + T2 second; +}; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template +struct alignas(alignof(max_alignment_helper_t)) aligned_triple { + char first[sizeof(T1)]; + T2 second; + T3 third; +}; +#endif + + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4324 is back +#endif // support for variant type // type we use when we're not storing a value diff --git a/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h b/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h index 46e7b95d6c..8121946729 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -32,6 +32,12 @@ namespace d1 { class base_filter; } +namespace d2 { +template +__TBB_requires(std::copyable) +class input_node; +} + namespace r1 { TBB_EXPORT void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&); class pipeline; @@ -131,7 +137,7 @@ class flow_control { template friend class concrete_filter; template __TBB_requires(std::copyable) - friend class input_node; + friend class d2::input_node; public: void stop() { is_pipeline_stopped = true; } }; diff --git a/third-party/tbb/include/oneapi/tbb/detail/_task.h b/third-party/tbb/include/oneapi/tbb/detail/_task.h index 636aea97b4..e1bb70c5be 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_task.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_task.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -43,6 +43,13 @@ class task; class wait_context; class task_group_context; struct execution_data; +class wait_tree_vertex_interface; +class task_arena_base; +} + +namespace d2 { +class task_group; +class task_group_base; } namespace r1 { @@ -52,7 +59,9 @@ TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& c TBB_EXPORT void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx); TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx); TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*); +TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base&); TBB_EXPORT d1::task_group_context* __TBB_EXPORTED_FUNC current_context(); +TBB_EXPORT d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* wc); // Do not place under __TBB_RESUMABLE_TASKS. It is a stub for unsupported platforms. 
struct suspend_point_type; @@ -124,8 +133,7 @@ class wait_context { friend class r1::thread_data; friend class r1::task_dispatcher; friend class r1::external_waiter; - friend class task_group; - friend class task_group_base; + friend class wait_context_vertex; friend struct r1::task_arena_impl; friend struct r1::suspend_point_type; public: @@ -147,6 +155,67 @@ class wait_context { } }; +class wait_tree_vertex_interface { +public: + virtual void reserve(std::uint32_t delta = 1) = 0; + virtual void release(std::uint32_t delta = 1) = 0; + +protected: + virtual ~wait_tree_vertex_interface() = default; +}; + +class wait_context_vertex : public wait_tree_vertex_interface { +public: + wait_context_vertex(std::uint32_t ref = 0) : m_wait(ref) {} + + void reserve(std::uint32_t delta = 1) override { + m_wait.reserve(delta); + } + + void release(std::uint32_t delta = 1) override { + m_wait.release(delta); + } + + wait_context& get_context() { + return m_wait; + } +private: + friend class d2::task_group; + friend class d2::task_group_base; + + bool continue_execution() const { + return m_wait.continue_execution(); + } + + wait_context m_wait; +}; + +class reference_vertex : public wait_tree_vertex_interface { +public: + reference_vertex(wait_tree_vertex_interface* parent, std::uint32_t ref_count) : my_parent{parent}, m_ref_count{ref_count} + {} + + void reserve(std::uint32_t delta = 1) override { + if (m_ref_count.fetch_add(static_cast(delta)) == 0) { + my_parent->reserve(); + } + } + + void release(std::uint32_t delta = 1) override { + std::uint64_t ref = m_ref_count.fetch_sub(static_cast(delta)) - static_cast(delta); + if (ref == 0) { + my_parent->release(); + } + } + + std::uint32_t get_num_child() { + return static_cast(m_ref_count.load(std::memory_order_acquire)); + } +private: + wait_tree_vertex_interface* my_parent; + std::atomic m_ref_count; +}; + struct execution_data { task_group_context* context{}; slot_id original_slot{}; diff --git 
a/third-party/tbb/include/oneapi/tbb/detail/_task_handle.h b/third-party/tbb/include/oneapi/tbb/detail/_task_handle.h index e32154f409..26212b462c 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_task_handle.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_task_handle.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ class task_handle; class task_handle_task : public d1::task { std::uint64_t m_version_and_traits{}; - d1::wait_context& m_wait_ctx; + d1::wait_tree_vertex_interface* m_wait_tree_vertex; d1::task_group_context& m_ctx; d1::small_object_allocator m_allocator; public: @@ -46,15 +46,16 @@ class task_handle_task : public d1::task { } } - task_handle_task(d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) - : m_wait_ctx(wo) + task_handle_task(d1::wait_tree_vertex_interface* vertex, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : m_wait_tree_vertex(vertex) , m_ctx(ctx) , m_allocator(alloc) { suppress_unused_warning(m_version_and_traits); + m_wait_tree_vertex->reserve(); } ~task_handle_task() override { - m_wait_ctx.release(); + m_wait_tree_vertex->release(); } d1::task_group_context& ctx() const { return m_ctx; } diff --git a/third-party/tbb/include/oneapi/tbb/flow_graph.h b/third-party/tbb/include/oneapi/tbb/flow_graph.h index 2df4b14050..20916fa7c2 100644 --- a/third-party/tbb/include/oneapi/tbb/flow_graph.h +++ b/third-party/tbb/include/oneapi/tbb/flow_graph.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -52,6 +52,7 @@ #include #include +#include #include #if __TBB_CPP20_CONCEPTS_PRESENT #include @@ -70,7 +71,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { //! An enumeration the provides the two most common concurrency levels: unlimited and serial enum concurrency { unlimited = 0, serial = 1 }; @@ -81,19 +82,19 @@ struct null_type {}; //! An empty class used for messages that mean "I'm done" class continue_msg {}; -} // namespace d1 +} // namespace d2 #if __TBB_CPP20_CONCEPTS_PRESENT namespace d0 { template -concept node_body_return_type = std::same_as || +concept node_body_return_type = std::same_as || std::convertible_to; // TODO: consider using std::invocable here template concept continue_node_body = std::copy_constructible && - requires( Body& body, const tbb::detail::d1::continue_msg& v ) { + requires( Body& body, const tbb::detail::d2::continue_msg& v ) { { body(v) } -> node_body_return_type; }; @@ -129,7 +130,7 @@ concept async_node_body = std::copy_constructible && } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT -namespace d1 { +namespace d2 { //! 
Forward declaration section template< typename T > class sender; @@ -153,7 +154,7 @@ template struct node_set; #endif -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb @@ -162,7 +163,7 @@ template struct node_set; namespace tbb { namespace detail { -namespace d1 { +namespace d2 { static inline std::pair order_tasks(graph_task* first, graph_task* second) { if (second->priority > first->priority) @@ -187,6 +188,37 @@ static inline graph_task* combine_tasks(graph& g, graph_task* left, graph_task* return left; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +class message_metainfo { +public: + using waiters_type = std::forward_list; + + message_metainfo() = default; + + message_metainfo(const waiters_type& waiters) : my_waiters(waiters) {} + message_metainfo(waiters_type&& waiters) : my_waiters(std::move(waiters)) {} + + const waiters_type& waiters() const & { return my_waiters; } + waiters_type&& waiters() && { return std::move(my_waiters); } + + bool empty() const { return my_waiters.empty(); } + + void merge(const message_metainfo& other) { + // TODO: should we avoid duplications on merging + my_waiters.insert_after(my_waiters.before_begin(), + other.waiters().begin(), + other.waiters().end()); + } +private: + waiters_type my_waiters; +}; // class message_metainfo + +#define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) , metainfo + +#else +#define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Pure virtual template class that defines a sender of messages of type T template< typename T > class sender { @@ -196,9 +228,17 @@ class sender { //! Request an item from the sender virtual bool try_get( T & ) { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual bool try_get( T &, message_metainfo& ) { return false; } +#endif + //! 
Reserves an item in the sender virtual bool try_reserve( T & ) { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual bool try_reserve( T &, message_metainfo& ) { return false; } +#endif + //! Releases the reserved item virtual bool try_release( ) { return false; } @@ -238,17 +278,38 @@ bool remove_successor(sender& s, receiver& r) { //! Pure virtual template class that defines a receiver of messages of type T template< typename T > class receiver { +private: + template + bool internal_try_put(const T& t, TryPutTaskArgs&&... args) { + graph_task* res = try_put_task(t, std::forward(args)...); + if (!res) return false; + if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); + return true; + } + public: //! Destructor virtual ~receiver() {} //! Put an item to the receiver bool try_put( const T& t ) { - graph_task *res = try_put_task(t); - if (!res) return false; - if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); - return true; + return internal_try_put(t); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Put an item to the receiver and wait for completion + bool try_put_and_wait( const T& t ) { + // Since try_put_and_wait is a blocking call, it is safe to create wait_context on stack + d1::wait_context_vertex msg_wait_vertex{}; + + bool res = internal_try_put(t, message_metainfo{message_metainfo::waiters_type{&msg_wait_vertex}}); + if (res) { + __TBB_ASSERT(graph_reference().my_context != nullptr, "No wait_context associated with the Flow Graph"); + wait(msg_wait_vertex.get_context(), *graph_reference().my_context); + } + return res; } +#endif //! put item to successor; return task to run the successor if possible. 
protected: @@ -262,6 +323,9 @@ class receiver { template< typename X, typename Y > friend class broadcast_cache; template< typename X, typename Y > friend class round_robin_cache; virtual graph_task *try_put_task(const T& t) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task *try_put_task(const T& t, const message_metainfo&) = 0; +#endif virtual graph& graph_reference() const = 0; template friend class successor_cache; @@ -337,23 +401,61 @@ class continue_receiver : public receiver< continue_msg > { template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; + +private: // execute body is supposed to be too small to create a task for. - graph_task* try_put_task( const input_type & ) override { + graph_task* try_put_task_impl( const input_type& __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo predecessor_metainfo; +#endif { spin_mutex::scoped_lock l(my_mutex); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Prolong the wait and store the metainfo until receiving signals from all the predecessors + for (auto waiter : metainfo.waiters()) { + waiter->reserve(1); + } + my_current_metainfo.merge(metainfo); +#endif if ( ++my_current_count < my_predecessor_count ) return SUCCESSFULLY_ENQUEUED; - else + else { my_current_count = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + predecessor_metainfo = my_current_metainfo; + my_current_metainfo = message_metainfo{}; +#endif + } + } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* res = execute(predecessor_metainfo); + for (auto waiter : predecessor_metainfo.waiters()) { + waiter->release(1); } +#else graph_task* res = execute(); +#endif return res? 
res : SUCCESSFULLY_ENQUEUED; } +protected: + graph_task* try_put_task( const input_type& input ) override { + return try_put_task_impl(input __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const input_type& input, const message_metainfo& metainfo ) override { + return try_put_task_impl(input, metainfo); + } +#endif + spin_mutex my_mutex; int my_predecessor_count; int my_current_count; int my_initial_predecessor_count; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo my_current_metainfo; +#endif node_priority_t my_priority; // the friend declaration in the base class did not eliminate the "protected class" // error in gcc 4.1.2 @@ -369,7 +471,11 @@ class continue_receiver : public receiver< continue_msg > { //! Does whatever should happen when the threshold is reached /** This should be very fast or else spawn a task. This is called while the sender is blocked in the try_put(). */ +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* execute(const message_metainfo& metainfo) = 0; +#else virtual graph_task* execute() = 0; +#endif template friend class successor_cache; bool is_continue_receiver() override { return true; } @@ -392,7 +498,7 @@ class continue_receiver : public receiver< continue_msg > { namespace tbb { namespace detail { -namespace d1 { +namespace d2 { #include "detail/_flow_graph_body_impl.h" #include "detail/_flow_graph_cache_impl.h" @@ -424,7 +530,7 @@ void graph_iterator::internal_forward() { } //! 
Constructs a graph with isolated task_group_context -inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { +inline graph::graph() : my_wait_context_vertex(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = true; cancelled = false; @@ -435,7 +541,7 @@ inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nul } inline graph::graph(task_group_context& use_this_context) : - my_wait_context(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { + my_wait_context_vertex(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = false; cancelled = false; @@ -454,13 +560,13 @@ inline graph::~graph() { } inline void graph::reserve_wait() { - my_wait_context.reserve(); + my_wait_context_vertex.reserve(); fgt_reserve_wait(this); } inline void graph::release_wait() { fgt_release_wait(this); - my_wait_context.release(); + my_wait_context_vertex.release(); } inline void graph::register_node(graph_node *n) { @@ -633,6 +739,18 @@ class input_node : public graph_node, public sender< Output > { } } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +private: + bool try_reserve( output_type& v, message_metainfo& ) override { + return try_reserve(v); + } + + bool try_get( output_type& v, message_metainfo& ) override { + return try_get(v); + } +public: +#endif + //! Release a reserved item. 
/** true = item has been released and so remains in sender, dest must request or reserve future items */ bool try_release( ) override { @@ -703,7 +821,7 @@ class input_node : public graph_node, public sender< Output > { return false; } if ( !my_has_cached_item ) { - flow_control control; + d1::flow_control control; fgt_begin_body( my_body ); @@ -722,10 +840,9 @@ class input_node : public graph_node, public sender< Output > { } graph_task* create_put_task() { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef input_node_task_bypass< input_node > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); return t; } @@ -962,6 +1079,14 @@ class split_node : public graph_node, public receiver { // Also, we do not have successors here. So we just tell the task returned here is successful. return emit_element::emit_this(this->my_graph, t, output_ports()); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const TupleType& t, const message_metainfo& metainfo) override { + // Sending split messages in parallel is not justified, as overheads would prevail. + // Also, we do not have successors here. So we just tell the task returned here is successful. 
+ return emit_element::emit_this(this->my_graph, t, output_ports(), metainfo); + } +#endif + void reset_node(reset_flags f) override { if (f & rf_clear_edges) clear_element::clear_this(my_output_ports); @@ -1119,17 +1244,28 @@ class broadcast_node : public graph_node, public receiver, public sender { return true; } +private: + graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + graph_task* new_task = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; + return new_task; + } + protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; //! build a task to run the successor if possible. Default is old behavior. - graph_task *try_put_task(const T& t) override { - graph_task *new_task = my_successors.try_put_task(t); - if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; - return new_task; + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } @@ -1168,24 +1304,37 @@ class buffer_node }; // implements the aggregator_operation concept - class buffer_operation : public aggregated_operation< buffer_operation > { + class buffer_operation : public d1::aggregated_operation< buffer_operation > { public: char type; T* elem; graph_task* ltask; successor_type *r; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo{ nullptr }; +#endif buffer_operation(const T& e, op_type t) : type(char(t)) , elem(const_cast(&e)) , ltask(nullptr) , r(nullptr) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + buffer_operation(const T& 
e, op_type t, const message_metainfo& info) + : type(char(t)), elem(const_cast(&e)), ltask(nullptr), r(nullptr) + , metainfo(const_cast(&info)) + {} + + buffer_operation(op_type t, message_metainfo& info) + : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr), metainfo(&info) {} +#endif buffer_operation(op_type t) : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr) {} }; bool forwarder_busy; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator< handler_type, buffer_operation> my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator< handler_type, buffer_operation> my_aggregator; virtual void handle_operations(buffer_operation *op_list) { handle_operations_impl(op_list, this); @@ -1218,9 +1367,8 @@ class buffer_node if(is_graph_active(this->my_graph)) { forwarder_busy = true; typedef forward_task_bypass task_type; - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; graph_task* new_task = allocator.new_object(graph_reference(), allocator, *this); - my_graph.reserve_wait(); // tmp should point to the last item handled by the aggregator. This is the operation // the handling thread enqueued. So modifying that record will be okay. 
// TODO revamp: check that the issue is still present @@ -1286,7 +1434,8 @@ class buffer_node } void try_put_and_add_task(graph_task*& last_task) { - graph_task *new_task = my_successors.try_put_task(this->back()); + graph_task* new_task = my_successors.try_put_task(this->back() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->back_metainfo())); if (new_task) { // workaround for icc bug graph& g = this->my_graph; @@ -1328,14 +1477,25 @@ class buffer_node virtual bool internal_push(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + this->push_back(*(op->elem), (*op->metainfo)); +#else this->push_back(*(op->elem)); +#endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } virtual void internal_pop(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); - if(this->pop_back(*(op->elem))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool pop_result = op->metainfo ? this->pop_back(*(op->elem), *(op->metainfo)) + : this->pop_back(*(op->elem)); +#else + bool pop_result = this->pop_back(*(op->elem)); +#endif + if (pop_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -1345,7 +1505,13 @@ class buffer_node virtual void internal_reserve(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); - if(this->reserve_front(*(op->elem))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve_result = op->metainfo ? this->reserve_front(*(op->elem), *(op->metainfo)) + : this->reserve_front(*(op->elem)); +#else + bool reserve_result = this->reserve_front(*(op->elem)); +#endif + if (reserve_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -1403,7 +1569,7 @@ class buffer_node It also calls r.remove_predecessor(*this) to remove this node as a predecessor. 
*/ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why full qualification is necessary here - tbb::detail::d1::remove_predecessor(r, *this); + tbb::detail::d2::remove_predecessor(r, *this); buffer_operation op_data(rem_succ); op_data.r = &r; my_aggregator.execute(&op_data); @@ -1425,6 +1591,16 @@ class buffer_node return (op_data.status==SUCCEEDED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( T &v, message_metainfo& metainfo ) override { + buffer_operation op_data(req_item, metainfo); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return (op_data.status==SUCCEEDED); + } +#endif + //! Reserves an item. /** false = no item can be reserved
true = an item is reserved */ @@ -1436,6 +1612,16 @@ class buffer_node return (op_data.status==SUCCEEDED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_reserve( output_type& v, message_metainfo& metainfo ) override { + buffer_operation op_data(res_item, metainfo); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return op_data.status==SUCCEEDED; + } +#endif + //! Release a reserved item. /** true = item has been released and so remains in sender */ bool try_release() override { @@ -1454,14 +1640,9 @@ class buffer_node return true; } -protected: - - template< typename R, typename B > friend class run_and_put_task; - template friend class broadcast_cache; - template friend class round_robin_cache; - //! receive an item, return a task *if possible - graph_task *try_put_task(const T &t) override { - buffer_operation op_data(t, put_item); +private: + graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + buffer_operation op_data(t, put_item __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); graph_task *ft = grab_forwarding_task(op_data); // sequencer_nodes can return failure (if an item has been previously inserted) @@ -1479,6 +1660,22 @@ class buffer_node return ft; } +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! 
receive an item, return a task *if possible + graph_task *try_put_task(const T &t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } @@ -1511,7 +1708,9 @@ class queue_node : public buffer_node { } void try_put_and_add_task(graph_task*& last_task) { - graph_task *new_task = this->my_successors.try_put_task(this->front()); + graph_task* new_task = this->my_successors.try_put_task(this->front() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->front_metainfo())); + if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); @@ -1530,7 +1729,14 @@ class queue_node : public buffer_node { op->status.store(FAILED, std::memory_order_release); } else { - this->pop_front(*(op->elem)); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + this->pop_front(*(op->elem), *(op->metainfo)); + } else +#endif + { + this->pop_front(*(op->elem)); + } op->status.store(SUCCEEDED, std::memory_order_release); } } @@ -1539,7 +1745,15 @@ class queue_node : public buffer_node { op->status.store(FAILED, std::memory_order_release); } else { - this->reserve_front(*(op->elem)); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + this->reserve_front(*(op->elem), *(op->metainfo)); + } + else +#endif + { + this->reserve_front(*(op->elem)); + } op->status.store(SUCCEEDED, std::memory_order_release); } } @@ -1647,7 +1861,13 @@ class sequencer_node : public queue_node { } this->my_tail = new_tail; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + bool place_item_result = this->place_item(tag, *(op->elem), *(op->metainfo)); + const op_stat res = place_item_result ? 
SUCCEEDED : FAILED; +#else const op_stat res = this->place_item(tag, *(op->elem)) ? SUCCEEDED : FAILED; +#endif op->status.store(res, std::memory_order_release); return res ==SUCCEEDED; } @@ -1710,7 +1930,12 @@ class priority_queue_node : public buffer_node { } bool internal_push(prio_operation *op) override { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + prio_push(*(op->elem), *(op->metainfo)); +#else prio_push(*(op->elem)); +#endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } @@ -1723,6 +1948,11 @@ class priority_queue_node : public buffer_node { } *(op->elem) = prio(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + *(op->metainfo) = std::move(prio_metainfo()); + } +#endif op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); @@ -1736,6 +1966,12 @@ class priority_queue_node : public buffer_node { } this->my_reserved = true; *(op->elem) = prio(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + *(op->metainfo) = std::move(prio_metainfo()); + reserved_metainfo = *(op->metainfo); + } +#endif reserved_item = *(op->elem); op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); @@ -1745,13 +1981,27 @@ class priority_queue_node : public buffer_node { op->status.store(SUCCEEDED, std::memory_order_release); this->my_reserved = false; reserved_item = input_type(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto waiter : reserved_metainfo.waiters()) { + waiter->release(1); + } + + reserved_metainfo = message_metainfo{}; +#endif } void internal_release(prio_operation *op) override { op->status.store(SUCCEEDED, std::memory_order_release); - prio_push(reserved_item); + prio_push(reserved_item __TBB_FLOW_GRAPH_METAINFO_ARG(reserved_metainfo)); this->my_reserved = false; reserved_item = input_type(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto waiter : reserved_metainfo.waiters()) { + waiter->release(1); + } + + 
reserved_metainfo = message_metainfo{}; +#endif } private: @@ -1767,7 +2017,8 @@ class priority_queue_node : public buffer_node { } void try_put_and_add_task(graph_task*& last_task) { - graph_task * new_task = this->my_successors.try_put_task(this->prio()); + graph_task* new_task = this->my_successors.try_put_task(this->prio() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->prio_metainfo())); if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); @@ -1781,6 +2032,9 @@ class priority_queue_node : public buffer_node { size_type mark; input_type reserved_item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo reserved_metainfo; +#endif // in case a reheap has not been done after a push, check if the mark item is higher than the 0'th item bool prio_use_tail() { @@ -1789,10 +2043,10 @@ class priority_queue_node : public buffer_node { } // prio_push: checks that the item will fit, expand array if necessary, put at end - void prio_push(const T &src) { + void prio_push(const T &src __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if ( this->my_tail >= this->my_array_size ) this->grow_my_array( this->my_tail + 1 ); - (void) this->place_item(this->my_tail, src); + (void) this->place_item(this->my_tail, src __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++(this->my_tail); __TBB_ASSERT(mark < this->my_tail, "mark outside bounds after push"); } @@ -1826,6 +2080,12 @@ class priority_queue_node : public buffer_node { return this->get_my_item(prio_use_tail() ? this->my_tail-1 : 0); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo& prio_metainfo() { + return this->get_my_metainfo(prio_use_tail() ? 
this->my_tail-1 : 0); + } +#endif + // turn array into heap void heapify() { if(this->my_tail == 0) { @@ -1836,7 +2096,10 @@ class priority_queue_node : public buffer_node { for (; markmy_tail; ++mark) { // for each unheaped element size_type cur_pos = mark; input_type to_place; - this->fetch_item(mark,to_place); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + this->fetch_item(mark, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); do { // push to_place up the heap size_type parent = (cur_pos-1)>>1; if (!compare(this->get_my_item(parent), to_place)) @@ -1844,7 +2107,7 @@ class priority_queue_node : public buffer_node { this->move_item(cur_pos, parent); cur_pos = parent; } while( cur_pos ); - (void) this->place_item(cur_pos, to_place); + this->place_item(cur_pos, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } @@ -1944,9 +2207,12 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > //SUCCESS // if we can reserve and can put, we consume the reservation // we increment the count and decrement the tries - if ( (my_predecessors.try_reserve(v)) == true ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + if ( (my_predecessors.try_reserve(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) == true ) { reserved = true; - if ( (rval = my_successors.try_put_task(v)) != nullptr ) { + if ( (rval = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) != nullptr ) { { spin_mutex::scoped_lock lock(my_mutex); ++my_count; @@ -1965,9 +2231,8 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { typedef forward_task_bypass> task_type; - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; graph_task* rtask = allocator.new_object( my_graph, allocator, *this ); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), 
*rtask); } } @@ -1984,10 +2249,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > if (reserved) my_predecessors.try_release(); if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); __TBB_ASSERT(!rval, "Have two tasks to handle"); return t; } @@ -2035,10 +2299,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > //spawn a forward task if this is the only successor if ( was_empty && !my_predecessors.empty() && my_count + my_tries < my_threshold ) { if ( is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), *t); } } @@ -2049,7 +2312,7 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > /** r.remove_predecessor(*this) is also called. 
*/ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why qualification is needed for remove_predecessor() call - tbb::detail::d1::remove_predecessor(r, *this); + tbb::detail::d2::remove_predecessor(r, *this); my_successors.remove_successor(r); return true; } @@ -2059,10 +2322,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > spin_mutex::scoped_lock lock(my_mutex); my_predecessors.add( src ); if ( my_count + my_tries < my_threshold && !my_successors.empty() && is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), *t); } return true; @@ -2079,8 +2341,10 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; + +private: //! Puts an item to this receiver - graph_task* try_put_task( const T &t ) override { + graph_task* try_put_task_impl( const T &t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { { spin_mutex::scoped_lock lock(my_mutex); if ( my_count + my_tries >= my_threshold ) @@ -2089,15 +2353,14 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > ++my_tries; } - graph_task* rtask = my_successors.try_put_task(t); + graph_task* rtask = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( !rtask ) { // try_put_task failed. 
spin_mutex::scoped_lock lock(my_mutex); --my_tries; if (check_conditions() && is_graph_active(this->my_graph)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; rtask = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); } } else { @@ -2118,6 +2381,16 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > return rtask; } +protected: + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } void reset_node( reset_flags f ) override { @@ -3054,10 +3327,9 @@ class overwrite_node : public graph_node, public receiver, public sender { // because failed reserve does not mean that register_successor is not ready to put a message immediately. // We have some sort of infinite loop: reserving node tries to set pull state for the edge, // but overwrite_node tries to return push state back. That is why we have to break this loop with task creation. 
- small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef register_predecessor_task task_type; graph_task* t = allocator.new_object(graph_reference(), allocator, *this, s); - graph_reference().reserve_wait(); spawn_in_graph_arena( my_graph, *t ); } } else { @@ -3082,11 +3354,45 @@ class overwrite_node : public graph_node, public receiver, public sender { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( input_type &v, message_metainfo& metainfo ) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid) { + v = my_buffer; + metainfo = my_buffered_metainfo; + + // Since the successor of the node will use move semantics while wrapping the metainfo + // that is designed to transfer the ownership of the value from single-push buffer to the task + // It is required to reserve one more reference here because the value keeps in the buffer + // and the ownership is not transferred + for (auto msg_waiter : metainfo.waiters()) { + msg_waiter->reserve(1); + } + return true; + } + return false; + } +#endif + //! Reserves an item bool try_reserve( T &v ) override { return try_get(v); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +private: + bool try_reserve(T& v, message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid) { + v = my_buffer; + metainfo = my_buffered_metainfo; + return true; + } + return false; + } +public: +#endif + //! 
Releases the reserved item bool try_release() override { return true; } @@ -3101,6 +3407,12 @@ class overwrite_node : public graph_node, public receiver, public sender { void clear() { spin_mutex::scoped_lock l( my_mutex ); my_buffer_is_valid = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto msg_waiter : my_buffered_metainfo.waiters()) { + msg_waiter->release(1); + } + my_buffered_metainfo = message_metainfo{}; +#endif } protected: @@ -3110,13 +3422,33 @@ class overwrite_node : public graph_node, public receiver, public sender { template friend class round_robin_cache; graph_task* try_put_task( const input_type &v ) override { spin_mutex::scoped_lock l( my_mutex ); - return try_put_task_impl(v); + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const input_type& v, const message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( my_mutex ); + return try_put_task_impl(v, metainfo); } +#endif - graph_task * try_put_task_impl(const input_type &v) { + graph_task * try_put_task_impl(const input_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { my_buffer = v; my_buffer_is_valid = true; - graph_task* rtask = my_successors.try_put_task(v); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Since the new item is pushed to the buffer - reserving the waiters + for (auto msg_waiter : metainfo.waiters()) { + msg_waiter->reserve(1); + } + + // Since the item is taken out from the buffer - releasing the stored waiters + for (auto msg_waiter : my_buffered_metainfo.waiters()) { + msg_waiter->release(1); + } + + my_buffered_metainfo = metainfo; +#endif + graph_task* rtask = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(my_buffered_metainfo) ); if (!rtask) rtask = SUCCESSFULLY_ENQUEUED; return rtask; } @@ -3128,13 +3460,13 @@ class overwrite_node : public graph_node, public receiver, public sender { //! 
Breaks an infinite loop between the node reservation and register_successor call struct register_predecessor_task : public graph_task { register_predecessor_task( - graph& g, small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) + graph& g, d1::small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) : graph_task(g, allocator), o(owner), s(succ) {}; - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { // TODO revamp: investigate why qualification is needed for register_successor() call - using tbb::detail::d1::register_predecessor; - using tbb::detail::d1::register_successor; + using tbb::detail::d2::register_predecessor; + using tbb::detail::d2::register_successor; if ( !register_predecessor(s, o) ) { register_successor(o, s); } @@ -3142,7 +3474,7 @@ class overwrite_node : public graph_node, public receiver, public sender { return nullptr; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -3154,6 +3486,9 @@ class overwrite_node : public graph_node, public receiver, public sender { spin_mutex my_mutex; broadcast_cache< input_type, null_rw_mutex > my_successors; input_type my_buffer; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo my_buffered_metainfo; +#endif bool my_buffer_is_valid; void reset_node( reset_flags f) override { @@ -3200,8 +3535,15 @@ class write_once_node : public overwrite_node { template friend class round_robin_cache; graph_task *try_put_task( const T &v ) override { spin_mutex::scoped_lock l( this->my_mutex ); - return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v); + return this->my_buffer_is_valid ? 
nullptr : this->try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( this->my_mutex ); + return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v, metainfo); + } +#endif }; // write_once_node inline void set_name(const graph& g, const char *name) { @@ -3293,7 +3635,7 @@ inline void set_name(const async_node& node, const char * { fgt_multioutput_node_desc(&node, name); } -} // d1 +} // d2 } // detail } // tbb @@ -3304,56 +3646,56 @@ inline void set_name(const async_node& node, const char * namespace tbb { namespace flow { inline namespace v1 { - using detail::d1::receiver; - using detail::d1::sender; - - using detail::d1::serial; - using detail::d1::unlimited; - - using detail::d1::reset_flags; - using detail::d1::rf_reset_protocol; - using detail::d1::rf_reset_bodies; - using detail::d1::rf_clear_edges; - - using detail::d1::graph; - using detail::d1::graph_node; - using detail::d1::continue_msg; - - using detail::d1::input_node; - using detail::d1::function_node; - using detail::d1::multifunction_node; - using detail::d1::split_node; - using detail::d1::output_port; - using detail::d1::indexer_node; - using detail::d1::tagged_msg; - using detail::d1::cast_to; - using detail::d1::is_a; - using detail::d1::continue_node; - using detail::d1::overwrite_node; - using detail::d1::write_once_node; - using detail::d1::broadcast_node; - using detail::d1::buffer_node; - using detail::d1::queue_node; - using detail::d1::sequencer_node; - using detail::d1::priority_queue_node; - using detail::d1::limiter_node; - using namespace detail::d1::graph_policy_namespace; - using detail::d1::join_node; - using detail::d1::input_port; - using detail::d1::copy_body; - using detail::d1::make_edge; - using detail::d1::remove_edge; - using detail::d1::tag_value; - using detail::d1::composite_node; 
- using detail::d1::async_node; - using detail::d1::node_priority_t; - using detail::d1::no_priority; + using detail::d2::receiver; + using detail::d2::sender; + + using detail::d2::serial; + using detail::d2::unlimited; + + using detail::d2::reset_flags; + using detail::d2::rf_reset_protocol; + using detail::d2::rf_reset_bodies; + using detail::d2::rf_clear_edges; + + using detail::d2::graph; + using detail::d2::graph_node; + using detail::d2::continue_msg; + + using detail::d2::input_node; + using detail::d2::function_node; + using detail::d2::multifunction_node; + using detail::d2::split_node; + using detail::d2::output_port; + using detail::d2::indexer_node; + using detail::d2::tagged_msg; + using detail::d2::cast_to; + using detail::d2::is_a; + using detail::d2::continue_node; + using detail::d2::overwrite_node; + using detail::d2::write_once_node; + using detail::d2::broadcast_node; + using detail::d2::buffer_node; + using detail::d2::queue_node; + using detail::d2::sequencer_node; + using detail::d2::priority_queue_node; + using detail::d2::limiter_node; + using namespace detail::d2::graph_policy_namespace; + using detail::d2::join_node; + using detail::d2::input_port; + using detail::d2::copy_body; + using detail::d2::make_edge; + using detail::d2::remove_edge; + using detail::d2::tag_value; + using detail::d2::composite_node; + using detail::d2::async_node; + using detail::d2::node_priority_t; + using detail::d2::no_priority; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET - using detail::d1::follows; - using detail::d1::precedes; - using detail::d1::make_node_set; - using detail::d1::make_edges; + using detail::d2::follows; + using detail::d2::precedes; + using detail::d2::make_node_set; + using detail::d2::make_edges; #endif } // v1 @@ -3362,7 +3704,7 @@ inline namespace v1 { using detail::d1::flow_control; namespace profiling { - using detail::d1::set_name; + using detail::d2::set_name; } // profiling } // tbb diff --git 
a/third-party/tbb/include/oneapi/tbb/flow_graph_abstractions.h b/third-party/tbb/include/oneapi/tbb/flow_graph_abstractions.h index 121f167c4d..329e75c43e 100644 --- a/third-party/tbb/include/oneapi/tbb/flow_graph_abstractions.h +++ b/third-party/tbb/include/oneapi/tbb/flow_graph_abstractions.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { //! Pure virtual template classes that define interfaces for async communication class graph_proxy { @@ -43,7 +43,7 @@ class receiver_gateway : public graph_proxy { virtual bool try_put(const input_type&) = 0; }; -} // d1 +} // d2 } // detail diff --git a/third-party/tbb/include/oneapi/tbb/memory_pool.h b/third-party/tbb/include/oneapi/tbb/memory_pool.h index b2e6b05191..5ece879002 100644 --- a/third-party/tbb/include/oneapi/tbb/memory_pool.h +++ b/third-party/tbb/include/oneapi/tbb/memory_pool.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -97,10 +97,10 @@ class memory_pool_allocator { typedef memory_pool_allocator other; }; - explicit memory_pool_allocator(pool_type &pool) throw() : my_pool(&pool) {} - memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + explicit memory_pool_allocator(pool_type &pool) noexcept : my_pool(&pool) {} + memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} template - memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } @@ -117,7 +117,7 @@ class memory_pool_allocator { my_pool->free(p); } //! Largest value for which method allocate might succeed. - size_type max_size() const throw() { + size_type max_size() const noexcept { size_type max = static_cast(-1) / sizeof (value_type); return (max > 0 ? 
max : 1); } @@ -149,10 +149,10 @@ class memory_pool_allocator { typedef memory_pool_allocator other; }; - explicit memory_pool_allocator( pool_type &pool) throw() : my_pool(&pool) {} - memory_pool_allocator( const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + explicit memory_pool_allocator( pool_type &pool) noexcept : my_pool(&pool) {} + memory_pool_allocator( const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} template - memory_pool_allocator(const memory_pool_allocator& src) throw() : my_pool(src.my_pool) {} + memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} protected: pool_type *my_pool; diff --git a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h index ab0b345388..85c0269196 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -118,14 +118,17 @@ struct feeder_item_task: public task { using feeder_type = feeder_impl; template - feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc) : + feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc, wait_tree_vertex_interface& wait_vertex) : item(std::forward(input_item)), my_feeder(feeder), - my_allocator(alloc) - {} + my_allocator(alloc), + m_wait_tree_vertex(r1::get_thread_reference_vertex(&wait_vertex)) + { + m_wait_tree_vertex->reserve(); + } void finalize(const execution_data& ed) { - my_feeder.my_wait_context.release(); + m_wait_tree_vertex->release(); my_allocator.delete_object(this, ed); } @@ -160,6 +163,7 @@ struct feeder_item_task: public task { Item item; feeder_type& my_feeder; small_object_allocator my_allocator; + wait_tree_vertex_interface* m_wait_tree_vertex; }; // class feeder_item_task /** Implements new task adding procedure. @@ -170,9 +174,8 @@ class feeder_impl : public feeder { void internal_add_copy_impl(std::true_type, const Item& item) { using feeder_task = feeder_item_task; small_object_allocator alloc; - auto task = alloc.new_object(item, *this, alloc); + auto task = alloc.new_object(item, *this, alloc, my_wait_context); - my_wait_context.reserve(); spawn(*task, my_execution_context); } @@ -187,20 +190,19 @@ class feeder_impl : public feeder { void internal_add_move(Item&& item) override { using feeder_task = feeder_item_task; small_object_allocator alloc{}; - auto task = alloc.new_object(std::move(item), *this, alloc); + auto task = alloc.new_object(std::move(item), *this, alloc, my_wait_context); - my_wait_context.reserve(); spawn(*task, my_execution_context); } public: - feeder_impl(const Body& body, wait_context& w_context, task_group_context &context) + feeder_impl(const Body& body, wait_context_vertex& w_context, task_group_context &context) : my_body(body), my_wait_context(w_context) , my_execution_context(context) {} const Body& my_body; 
- wait_context& my_wait_context; + wait_context_vertex& my_wait_context; task_group_context& my_execution_context; }; // class feeder_impl @@ -263,7 +265,7 @@ struct input_block_handling_task : public task { using iteration_task_iterator_type = typename input_iteration_task_iterator_helper::type; using iteration_task = for_each_iteration_task; - input_block_handling_task(wait_context& root_wait_context, task_group_context& e_context, + input_block_handling_task(wait_context_vertex& root_wait_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) :my_size(0), my_wait_context(0), my_root_wait_context(root_wait_context), my_execution_context(e_context), my_allocator(alloc) @@ -312,7 +314,7 @@ struct input_block_handling_task : public task { aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; - wait_context& my_root_wait_context; + wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class input_block_handling_task @@ -326,7 +328,7 @@ struct forward_block_handling_task : public task { using iteration_task = for_each_iteration_task; forward_block_handling_task(Iterator first, std::size_t size, - wait_context& w_context, task_group_context& e_context, + wait_context_vertex& w_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) : my_size(size), my_wait_context(0), my_root_wait_context(w_context), @@ -373,7 +375,7 @@ struct forward_block_handling_task : public task { aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; - wait_context& my_root_wait_context; + wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class forward_block_handling_task @@ -456,7 +458,7 @@ using feeder_is_required = tbb::detail::void_t struct feeder_holder { - feeder_holder( 
wait_context&, task_group_context&, const Body& ) {} + feeder_holder( wait_context_vertex&, task_group_context&, const Body& ) {} feeder_impl* feeder_ptr() { return nullptr; } }; // class feeder_holder @@ -464,7 +466,7 @@ struct feeder_holder { template class feeder_holder> { public: - feeder_holder( wait_context& w_context, task_group_context& context, const Body& body ) + feeder_holder( wait_context_vertex& w_context, task_group_context& context, const Body& body ) : my_feeder(body, w_context, context) {} feeder_impl* feeder_ptr() { return &my_feeder; } @@ -475,7 +477,7 @@ class feeder_holder class for_each_root_task_base : public task { public: - for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context& w_context, task_group_context& e_context) + for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context_vertex& w_context, task_group_context& e_context) : my_first(first), my_last(last), my_wait_context(w_context), my_execution_context(e_context), my_body(body), my_feeder_holder(my_wait_context, my_execution_context, my_body) { @@ -489,7 +491,7 @@ class for_each_root_task_base : public task { protected: Iterator my_first; Iterator my_last; - wait_context& my_wait_context; + wait_context_vertex& my_wait_context; task_group_context& my_execution_context; const Body& my_body; feeder_holder my_feeder_holder; @@ -624,11 +626,11 @@ void run_parallel_for_each( Iterator first, Iterator last, const Body& body, tas { if (!(first == last)) { using ItemType = get_item_type::value_type>; - wait_context w_context(0); + wait_context_vertex w_context(0); for_each_root_task root_task(first, last, body, w_context, context); - execute_and_wait(root_task, context, w_context, context); + execute_and_wait(root_task, context, w_context.get_context(), context); } } diff --git a/third-party/tbb/include/oneapi/tbb/task_group.h b/third-party/tbb/include/oneapi/tbb/task_group.h index 04e241f607..c0811c8502 100644 --- 
a/third-party/tbb/include/oneapi/tbb/task_group.h +++ b/third-party/tbb/include/oneapi/tbb/task_group.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -45,7 +45,6 @@ namespace d1 { class delegate_base; class task_arena_base; class task_group_context; -class task_group_base; } namespace r1 { @@ -97,8 +96,8 @@ class function_task : public task_handle_task { } public: template - function_task(FF&& f, d1::wait_context& wo, d1::task_group_context& ctx, d1::small_object_allocator& alloc) - : task_handle_task{wo, ctx, alloc}, + function_task(FF&& f, d1::wait_tree_vertex_interface* vertex, d1::task_group_context& ctx, d1::small_object_allocator& alloc) + : task_handle_task{vertex, ctx, alloc}, m_func(std::forward(f)) {} }; @@ -414,11 +413,20 @@ class task_group_context : no_copy { friend class r1::context_guard_helper; friend struct r1::task_arena_impl; friend struct r1::task_group_context_impl; - friend class task_group_base; + friend class d2::task_group_base; }; // class task_group_context static_assert(sizeof(task_group_context) == 128, "Wrong size of task_group_context"); +inline bool is_current_task_group_canceling() { + task_group_context* ctx = current_context(); + return ctx ? ctx->is_group_execution_cancelled() : false; +} + +} // namespace d1 + +namespace d2 { + enum task_group_status { not_complete, complete, @@ -431,77 +439,41 @@ class structured_task_group; class isolated_task_group; #endif -template -class function_task : public task { - const F m_func; - wait_context& m_wait_ctx; - small_object_allocator m_allocator; - - void finalize(const execution_data& ed) { - // Make a local reference not to access this after destruction. - wait_context& wo = m_wait_ctx; - // Copy allocator to the stack - auto allocator = m_allocator; - // Destroy user functor before release wait. 
- this->~function_task(); - wo.release(); - - allocator.deallocate(this, ed); - } - task* execute(execution_data& ed) override { - task* res = d2::task_ptr_or_nullptr(m_func); - finalize(ed); - return res; - } - task* cancel(execution_data& ed) override { - finalize(ed); - return nullptr; - } -public: - function_task(const F& f, wait_context& wo, small_object_allocator& alloc) - : m_func(f) - , m_wait_ctx(wo) - , m_allocator(alloc) {} - - function_task(F&& f, wait_context& wo, small_object_allocator& alloc) - : m_func(std::move(f)) - , m_wait_ctx(wo) - , m_allocator(alloc) {} -}; - template -class function_stack_task : public task { +class function_stack_task : public d1::task { const F& m_func; - wait_context& m_wait_ctx; + d1::wait_tree_vertex_interface* m_wait_tree_vertex; void finalize() { - m_wait_ctx.release(); + m_wait_tree_vertex->release(); } - task* execute(execution_data&) override { + task* execute(d1::execution_data&) override { task* res = d2::task_ptr_or_nullptr(m_func); finalize(); return res; } - task* cancel(execution_data&) override { + task* cancel(d1::execution_data&) override { finalize(); return nullptr; } public: - function_stack_task(const F& f, wait_context& wo) : m_func(f), m_wait_ctx(wo) {} + function_stack_task(const F& f, d1::wait_tree_vertex_interface* vertex) : m_func(f), m_wait_tree_vertex(vertex) { + m_wait_tree_vertex->reserve(); + } }; class task_group_base : no_copy { protected: - wait_context m_wait_ctx; - task_group_context m_context; + d1::wait_context_vertex m_wait_vertex; + d1::task_group_context m_context; template task_group_status internal_run_and_wait(const F& f) { - function_stack_task t{ f, m_wait_ctx }; - m_wait_ctx.reserve(); + function_stack_task t{ f, r1::get_thread_reference_vertex(&m_wait_vertex) }; + bool cancellation_status = false; try_call([&] { - execute_and_wait(t, context(), m_wait_ctx, context()); + execute_and_wait(t, context(), m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: 
the reset method is not thread-safe. Ensure the correct behavior. cancellation_status = context().is_group_execution_cancelled(); @@ -518,7 +490,7 @@ class task_group_base : no_copy { bool cancellation_status = false; try_call([&] { - execute_and_wait(*acs::release(h), context(), m_wait_ctx, context()); + execute_and_wait(*acs::release(h), context(), m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. cancellation_status = context().is_group_execution_cancelled(); @@ -528,39 +500,39 @@ class task_group_base : no_copy { } template - task* prepare_task(F&& f) { - m_wait_ctx.reserve(); - small_object_allocator alloc{}; - return alloc.new_object::type>>(std::forward(f), m_wait_ctx, alloc); + d1::task* prepare_task(F&& f) { + d1::small_object_allocator alloc{}; + return alloc.new_object::type>>(std::forward(f), + r1::get_thread_reference_vertex(&m_wait_vertex), context(), alloc); } - task_group_context& context() noexcept { + d1::task_group_context& context() noexcept { return m_context.actual_context(); } template d2::task_handle prepare_task_handle(F&& f) { - m_wait_ctx.reserve(); - small_object_allocator alloc{}; + d1::small_object_allocator alloc{}; using function_task_t = d2::function_task::type>; - d2::task_handle_task* function_task_p = alloc.new_object(std::forward(f), m_wait_ctx, context(), alloc); + d2::task_handle_task* function_task_p = alloc.new_object(std::forward(f), + r1::get_thread_reference_vertex(&m_wait_vertex), context(), alloc); return d2::task_handle_accessor::construct(function_task_p); } public: task_group_base(uintptr_t traits = 0) - : m_wait_ctx(0) - , m_context(task_group_context::bound, task_group_context::default_traits | traits) + : m_wait_vertex(0) + , m_context(d1::task_group_context::bound, d1::task_group_context::default_traits | traits) {} - task_group_base(task_group_context& ctx) - : m_wait_ctx(0) + task_group_base(d1::task_group_context& ctx) + : 
m_wait_vertex(0) , m_context(&ctx) {} ~task_group_base() noexcept(false) { - if (m_wait_ctx.continue_execution()) { + if (m_wait_vertex.continue_execution()) { #if __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT bool stack_unwinding_in_progress = std::uncaught_exceptions() > 0; #else @@ -570,7 +542,7 @@ class task_group_base : no_copy { // in case of missing wait (for the sake of better testability & debuggability) if (!context().is_group_execution_cancelled()) cancel(); - d1::wait(m_wait_ctx, context()); + d1::wait(m_wait_vertex.get_context(), context()); if (!stack_unwinding_in_progress) throw_exception(exception_id::missing_wait); } @@ -579,7 +551,7 @@ class task_group_base : no_copy { task_group_status wait() { bool cancellation_status = false; try_call([&] { - d1::wait(m_wait_ctx, context()); + d1::wait(m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. cancellation_status = m_context.is_group_execution_cancelled(); @@ -595,12 +567,12 @@ class task_group_base : no_copy { class task_group : public task_group_base { public: - task_group() : task_group_base(task_group_context::concurrent_wait) {} - task_group(task_group_context& ctx) : task_group_base(ctx) {} + task_group() : task_group_base(d1::task_group_context::concurrent_wait) {} + task_group(d1::task_group_context& ctx) : task_group_base(ctx) {} template void run(F&& f) { - spawn(*prepare_task(std::forward(f)), context()); + d1::spawn(*prepare_task(std::forward(f)), context()); } void run(d2::task_handle&& h) { @@ -609,7 +581,7 @@ class task_group : public task_group_base { using acs = d2::task_handle_accessor; __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); - spawn(*acs::release(h), context()); + d1::spawn(*acs::release(h), context()); } template @@ -629,20 +601,20 @@ class task_group : public task_group_base { }; // class task_group #if TBB_PREVIEW_ISOLATED_TASK_GROUP 
-class spawn_delegate : public delegate_base { - task* task_to_spawn; - task_group_context& context; +class spawn_delegate : public d1::delegate_base { + d1::task* task_to_spawn; + d1::task_group_context& context; bool operator()() const override { spawn(*task_to_spawn, context); return true; } public: - spawn_delegate(task* a_task, task_group_context& ctx) + spawn_delegate(d1::task* a_task, d1::task_group_context& ctx) : task_to_spawn(a_task), context(ctx) {} }; -class wait_delegate : public delegate_base { +class wait_delegate : public d1::delegate_base { bool operator()() const override { status = tg.wait(); return true; @@ -674,7 +646,7 @@ class isolated_task_group : public task_group { public: isolated_task_group() : task_group() {} - isolated_task_group(task_group_context& ctx) : task_group(ctx) {} + isolated_task_group(d1::task_group_context& ctx) : task_group(ctx) {} template void run(F&& f) { @@ -710,26 +682,20 @@ class isolated_task_group : public task_group { } }; // class isolated_task_group #endif // TBB_PREVIEW_ISOLATED_TASK_GROUP - -inline bool is_current_task_group_canceling() { - task_group_context* ctx = current_context(); - return ctx ? 
ctx->is_group_execution_cancelled() : false; -} - -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { using detail::d1::task_group_context; -using detail::d1::task_group; +using detail::d2::task_group; #if TBB_PREVIEW_ISOLATED_TASK_GROUP -using detail::d1::isolated_task_group; +using detail::d2::isolated_task_group; #endif -using detail::d1::task_group_status; -using detail::d1::not_complete; -using detail::d1::complete; -using detail::d1::canceled; +using detail::d2::task_group_status; +using detail::d2::not_complete; +using detail::d2::complete; +using detail::d2::canceled; using detail::d1::is_current_task_group_canceling; using detail::r1::missing_wait; diff --git a/third-party/tbb/include/oneapi/tbb/version.h b/third-party/tbb/include/oneapi/tbb/version.h index fff3e7e2f9..c8f3ad50e3 100644 --- a/third-party/tbb/include/oneapi/tbb/version.h +++ b/third-party/tbb/include/oneapi/tbb/version.h @@ -27,9 +27,9 @@ #endif // Product version -#define TBB_VERSION_MAJOR 2021 +#define TBB_VERSION_MAJOR 2022 // Update version -#define TBB_VERSION_MINOR 13 +#define TBB_VERSION_MINOR 0 // "Patch" version for custom releases #define TBB_VERSION_PATCH 0 // Suffix string @@ -44,7 +44,7 @@ // OneAPI oneTBB specification version #define ONETBB_SPEC_VERSION "1.0" // Full interface version -#define TBB_INTERFACE_VERSION 12130 +#define TBB_INTERFACE_VERSION 12140 // Major interface version #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) // Minor interface version diff --git a/third-party/tbb/src/tbb/CMakeLists.txt b/third-party/tbb/src/tbb/CMakeLists.txt index b996c736a7..8c84a0b29b 100644 --- a/third-party/tbb/src/tbb/CMakeLists.txt +++ b/third-party/tbb/src/tbb/CMakeLists.txt @@ -126,6 +126,25 @@ target_link_libraries(tbb ${TBB_COMMON_LINK_LIBS} ) +# Strip debug symbols into a separate .dbg file +if(TBB_LINUX_SEPARATE_DBG) + if(NOT CMAKE_BUILD_TYPE STREQUAL "release") + find_program(OBJCOPY_COMMAND objcopy) + if(NOT OBJCOPY_COMMAND) 
+ message(WARNING "objcopy command not found in the system") + else() + add_custom_command(TARGET tbb POST_BUILD + COMMAND objcopy --only-keep-debug $ $.dbg + COMMAND objcopy --strip-debug $ + COMMAND objcopy --add-gnu-debuglink=$.dbg $ + COMMENT "Creating and associating .dbg file with tbb" + ) + endif() + else() + message(WARNING " TBB_LINUX_SEPARATE_DBG flag is not used on release config") + endif() +endif() + if(TBB_BUILD_APPLE_FRAMEWORKS) set_target_properties(tbb PROPERTIES FRAMEWORK TRUE @@ -158,7 +177,13 @@ if (TBB_INSTALL) COMPONENT devel ) endif() - + if(TBB_LINUX_SEPARATE_DBG) + install(FILES + $.dbg + DESTINATION lib + COMPONENT devel + ) + endif() set(_tbb_pc_lib_name tbb) if (WIN32) diff --git a/third-party/tbb/src/tbb/allocator.cpp b/third-party/tbb/src/tbb/allocator.cpp index 888f43fd33..689c51255d 100644 --- a/third-party/tbb/src/tbb/allocator.cpp +++ b/third-party/tbb/src/tbb/allocator.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -157,6 +157,14 @@ void initialize_cache_aligned_allocator() { } //! Executed on very first call through allocate_handler +/** Only one of initialize_allocate_handler() and initialize_cache_aligned_allocate_handler() + is called, since each one of them also initializes the other. + + In the current implementation of oneTBB library initialization, cache_aligned_allocate() is + used, which in turn calls initialize_cache_aligned_allocate_handler(). As mentioned above, + that also initializes the regular allocate_handler. + + Therefore, initialize_allocate_handler() is not called in the current library implementation. 
*/ static void* initialize_allocate_handler(std::size_t size) { initialize_cache_aligned_allocator(); __TBB_ASSERT(allocate_handler != &initialize_allocate_handler, nullptr); diff --git a/third-party/tbb/src/tbb/arena.cpp b/third-party/tbb/src/tbb/arena.cpp index 0e7cf43c3b..6ca062d02f 100644 --- a/third-party/tbb/src/tbb/arena.cpp +++ b/third-party/tbb/src/tbb/arena.cpp @@ -195,8 +195,6 @@ void arena::process(thread_data& tls) { return; } - my_tc_client.get_pm_client()->register_thread(); - __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); tls.attach_arena(*this, index); // worker thread enters the dispatch loop to look for a work @@ -236,8 +234,6 @@ void arena::process(thread_data& tls) { __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr); __TBB_ASSERT(is_alive(my_guard), nullptr); - my_tc_client.get_pm_client()->unregister_thread(); - // In contrast to earlier versions of TBB (before 3.0 U5) now it is possible // that arena may be temporarily left unpopulated by threads. See comments in // arena::on_thread_leaving() for more details. 
@@ -503,6 +499,7 @@ struct task_arena_impl { static void wait(d1::task_arena_base&); static int max_concurrency(const d1::task_arena_base*); static void enqueue(d1::task&, d1::task_group_context*, d1::task_arena_base*); + static d1::slot_id execution_slot(const d1::task_arena_base&); }; void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) { @@ -533,6 +530,10 @@ void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& ctx, d1::t task_arena_impl::enqueue(t, &ctx, ta); } +d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base& arena) { + return task_arena_impl::execution_slot(arena); +} + void task_arena_impl::initialize(d1::task_arena_base& ta) { // Enforce global market initialization to properly initialize soft limit (void)governor::get_thread_data(); @@ -559,7 +560,7 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) { ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); if (observer) { // TODO: Consider lazy initialization for internal arena so - // the direct calls to observer might be omitted until actual initialization. + // the direct calls to observer might be omitted until actual initialization. 
observer->on_scheduler_entry(true); } #endif /*__TBB_CPUBIND_PRESENT*/ @@ -624,6 +625,14 @@ void task_arena_impl::enqueue(d1::task& t, d1::task_group_context* c, d1::task_a a->enqueue_task(t, *ctx, *td); } +d1::slot_id task_arena_impl::execution_slot(const d1::task_arena_base& ta) { + thread_data* td = governor::get_thread_data_if_initialized(); + if (td && (td->is_attached_to(ta.my_arena.load(std::memory_order_relaxed)))) { + return td->my_arena_index; + } + return d1::slot_id(-1); +} + class nested_arena_context : no_copy { public: nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index) @@ -633,9 +642,11 @@ class nested_arena_context : no_copy { m_orig_arena = td.my_arena; m_orig_slot_index = td.my_arena_index; m_orig_last_observer = td.my_last_observer; + m_orig_is_thread_registered = td.my_is_registered; td.detach_task_dispatcher(); td.attach_arena(nested_arena, slot_index); + td.my_is_registered = false; if (td.my_inbox.is_idle_state(true)) td.my_inbox.set_is_idle(false); task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); @@ -686,7 +697,7 @@ class nested_arena_context : no_copy { td.leave_task_dispatcher(); td.my_arena_slot->release(); td.my_arena->my_exit_monitors.notify_one(); // do not relax! 
- + td.my_is_registered = m_orig_is_thread_registered; td.attach_arena(*m_orig_arena, m_orig_slot_index); td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp); __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); @@ -702,6 +713,7 @@ class nested_arena_context : no_copy { unsigned m_orig_slot_index{}; bool m_orig_fifo_tasks_allowed{}; bool m_orig_critical_task_allowed{}; + bool m_orig_is_thread_registered{}; }; class delegated_task : public d1::task { diff --git a/third-party/tbb/src/tbb/def/lin32-tbb.def b/third-party/tbb/src/tbb/def/lin32-tbb.def index ec03c3aa5c..737e8ec2af 100644 --- a/third-party/tbb/src/tbb/def/lin32-tbb.def +++ b/third-party/tbb/src/tbb/def/lin32-tbb.def @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -77,6 +77,7 @@ _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; _ZN3tbb6detail2r114notify_waitersEj; +_ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE; /* Task dispatcher (task_dispatcher.cpp) */ _ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; @@ -105,6 +106,7 @@ _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEi; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; +_ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; @@ -159,4 +161,3 @@ local: /* TODO: fill more precisely */ *; }; - diff --git a/third-party/tbb/src/tbb/def/lin64-tbb.def b/third-party/tbb/src/tbb/def/lin64-tbb.def index 119eea1348..41aca2e932 
100644 --- a/third-party/tbb/src/tbb/def/lin64-tbb.def +++ b/third-party/tbb/src/tbb/def/lin64-tbb.def @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -77,6 +77,7 @@ _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; _ZN3tbb6detail2r114notify_waitersEm; +_ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE; /* Task dispatcher (task_dispatcher.cpp) */ _ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; @@ -105,6 +106,7 @@ _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; +_ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; diff --git a/third-party/tbb/src/tbb/def/mac64-tbb.def b/third-party/tbb/src/tbb/def/mac64-tbb.def index fcccd7b858..38bc48d30e 100644 --- a/third-party/tbb/src/tbb/def/mac64-tbb.def +++ b/third-party/tbb/src/tbb/def/mac64-tbb.def @@ -1,4 +1,4 @@ -# Copyright (c) 2005-2021 Intel Corporation +# Copyright (c) 2005-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -79,6 +79,7 @@ __ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_ __ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE __ZN3tbb6detail2r121current_suspend_pointEv __ZN3tbb6detail2r114notify_waitersEm +__ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE # Task dispatcher (task_dispatcher.cpp) __ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE @@ -107,6 +108,7 @@ __ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl __ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE __ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE __ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE +__ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE # System topology parsing and threads pinning (governor.cpp) __ZN3tbb6detail2r115numa_node_countEv @@ -156,4 +158,3 @@ __ZN3tbb6detail2r121notify_by_address_allEPv # Versioning (version.cpp) _TBB_runtime_interface_version _TBB_runtime_version - diff --git a/third-party/tbb/src/tbb/def/win32-tbb.def b/third-party/tbb/src/tbb/def/win32-tbb.def index 6863914028..94b5441701 100644 --- a/third-party/tbb/src/tbb/def/win32-tbb.def +++ b/third-party/tbb/src/tbb/def/win32-tbb.def @@ -1,4 +1,4 @@ -; Copyright (c) 2005-2021 Intel Corporation +; Copyright (c) 2005-2024 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. 
@@ -71,6 +71,7 @@ EXPORTS ?resume@r1@detail@tbb@@YAXPAUsuspend_point_type@123@@Z ?suspend@r1@detail@tbb@@YAXP6AXPAXPAUsuspend_point_type@123@@Z0@Z ?notify_waiters@r1@detail@tbb@@YAXI@Z +?get_thread_reference_vertex@r1@detail@tbb@@YAPAVwait_tree_vertex_interface@d1@23@PAV4523@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@G@Z @@ -99,6 +100,7 @@ EXPORTS ?terminate@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?wait@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?enqueue@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@PAVtask_arena_base@523@@Z +?execution_slot@r1@detail@tbb@@YAGABVtask_arena_base@d1@23@@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ diff --git a/third-party/tbb/src/tbb/def/win64-tbb.def b/third-party/tbb/src/tbb/def/win64-tbb.def index 306903c129..96bafc0163 100644 --- a/third-party/tbb/src/tbb/def/win64-tbb.def +++ b/third-party/tbb/src/tbb/def/win64-tbb.def @@ -1,4 +1,4 @@ -; Copyright (c) 2005-2021 Intel Corporation +; Copyright (c) 2005-2024 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. 
@@ -71,6 +71,7 @@ EXPORTS ?resume@r1@detail@tbb@@YAXPEAUsuspend_point_type@123@@Z ?current_suspend_point@r1@detail@tbb@@YAPEAUsuspend_point_type@123@XZ ?notify_waiters@r1@detail@tbb@@YAX_K@Z +?get_thread_reference_vertex@r1@detail@tbb@@YAPEAVwait_tree_vertex_interface@d1@23@PEAV4523@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@@Z @@ -99,6 +100,7 @@ EXPORTS ?isolate_within_arena@r1@detail@tbb@@YAXAEAVdelegate_base@d1@23@_J@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@PEAVtask_arena_base@523@@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@PEAVtask_arena_base@523@@Z +?execution_slot@r1@detail@tbb@@YAGAEBVtask_arena_base@d1@23@@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ diff --git a/third-party/tbb/src/tbb/global_control.cpp b/third-party/tbb/src/tbb/global_control.cpp index 127fc92db3..f45c66b87f 100644 --- a/third-party/tbb/src/tbb/global_control.cpp +++ b/third-party/tbb/src/tbb/global_control.cpp @@ -104,6 +104,8 @@ class alignas(max_nfs_size) stack_size_control : public control_storage { return hi - lo; }(); return ThreadStackSizeDefault; +#elif defined(EMSCRIPTEN) + return __TBB_EMSCRIPTEN_STACK_SIZE; #else return ThreadStackSize; #endif diff --git a/third-party/tbb/src/tbb/governor.cpp b/third-party/tbb/src/tbb/governor.cpp index 55175196b2..218a2bc533 100644 --- a/third-party/tbb/src/tbb/governor.cpp +++ b/third-party/tbb/src/tbb/governor.cpp @@ -37,10 +37,18 @@ #include #include +#ifdef EMSCRIPTEN +#include +#endif + namespace tbb { namespace detail { namespace r1 { +#if TBB_USE_ASSERT +std::atomic the_observer_proxy_count; +#endif /* TBB_USE_ASSERT */ + void clear_address_waiter_table(); void global_control_acquire(); void global_control_release(); @@ -86,6 +94,12 @@ void governor::release_resources () { runtime_warning("failed to destroy task scheduler TLS: %s", std::strerror(status)); clear_address_waiter_table(); 
+#if TBB_USE_ASSERT + if (the_observer_proxy_count != 0) { + runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count)); + } +#endif /* TBB_USE_ASSERT */ + system_topology::destroy(); dynamic_unlink_all(); global_control_release(); @@ -145,6 +159,9 @@ static std::uintptr_t get_stack_base(std::size_t stack_size) { NT_TIB* pteb = (NT_TIB*)NtCurrentTeb(); __TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB"); return reinterpret_cast(pteb->StackBase); +#elif defined(EMSCRIPTEN) + suppress_unused_warning(stack_size); + return reinterpret_cast(emscripten_stack_get_base()); #else // There is no portable way to get stack base address in Posix, so we use // non-portable method (on all modern Linux) or the simplified approach diff --git a/third-party/tbb/src/tbb/main.cpp b/third-party/tbb/src/tbb/main.cpp index 85e759e2f6..f43c33f5b7 100644 --- a/third-party/tbb/src/tbb/main.cpp +++ b/third-party/tbb/src/tbb/main.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -72,21 +72,6 @@ void ITT_DoUnsafeOneTimeInitialization(); static __TBB_InitOnce __TBB_InitOnceHiddenInstance; #endif -#if TBB_USE_ASSERT -std::atomic the_observer_proxy_count; - -struct check_observer_proxy_count { - ~check_observer_proxy_count() { - if (the_observer_proxy_count != 0) { - runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count)); - } - } -}; -// The proxy count checker shall be defined after __TBB_InitOnceHiddenInstance to check the count -// after auto termination. 
-static check_observer_proxy_count the_check_observer_proxy_count; -#endif /* TBB_USE_ASSERT */ - //------------------------------------------------------------------------ // __TBB_InitOnce //------------------------------------------------------------------------ diff --git a/third-party/tbb/src/tbb/scheduler_common.h b/third-party/tbb/src/tbb/scheduler_common.h index f9e8a68d37..e4686e1673 100644 --- a/third-party/tbb/src/tbb/scheduler_common.h +++ b/third-party/tbb/src/tbb/scheduler_common.h @@ -23,6 +23,7 @@ #include "oneapi/tbb/detail/_machine.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/cache_aligned_allocator.h" +#include "oneapi/tbb/tbb_allocator.h" #include "itt_notify.h" #include "co_context.h" #include "misc.h" @@ -42,6 +43,7 @@ #include #include #include // unique_ptr +#include //! Mutex type for global locks in the scheduler using scheduler_mutex_type = __TBB_SCHEDULER_MUTEX_TYPE; @@ -395,7 +397,7 @@ struct suspend_point_type { void finilize_resume() { m_stack_state.store(stack_state::active, std::memory_order_relaxed); - // Set the suspended state for the stack that we left. If the state is already notified, it means that + // Set the suspended state for the stack that we left. If the state is already notified, it means that // someone already tried to resume our previous stack but failed. So, we need to resume it. // m_prev_suspend_point might be nullptr when destroying co_context based on threads if (m_prev_suspend_point && m_prev_suspend_point->m_stack_state.exchange(stack_state::suspended) == stack_state::notified) { @@ -474,6 +476,13 @@ class alignas (max_nfs_size) task_dispatcher { //! Suspend point (null if this task dispatcher has been never suspended) suspend_point_type* m_suspend_point{ nullptr }; + //! Used to improve scalability of d1::wait_context by using per thread reference_counter + std::unordered_map, std::equal_to, + tbb_allocator> + > + m_reference_vertex_map; + //! Attempt to get a task from the mailbox. 
/** Gets a task only if it has not been executed by its sender or a thief that has stolen it from the sender's task pool. Otherwise returns nullptr. @@ -502,6 +511,14 @@ class alignas (max_nfs_size) task_dispatcher { m_suspend_point->~suspend_point_type(); cache_aligned_deallocate(m_suspend_point); } + + for (auto& elem : m_reference_vertex_map) { + d1::reference_vertex*& node = elem.second; + node->~reference_vertex(); + cache_aligned_deallocate(node); + poison_pointer(node); + } + poison_pointer(m_thread_data); poison_pointer(m_suspend_point); } diff --git a/third-party/tbb/src/tbb/task.cpp b/third-party/tbb/src/tbb/task.cpp index 08463bf398..84b4278f0a 100644 --- a/third-party/tbb/src/tbb/task.cpp +++ b/third-party/tbb/src/tbb/task.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -221,7 +221,37 @@ void notify_waiters(std::uintptr_t wait_ctx_addr) { governor::get_thread_data()->my_arena->get_waiting_threads_monitor().notify(is_related_wait_ctx); } +d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* top_wait_context) { + __TBB_ASSERT(top_wait_context, nullptr); + auto& dispatcher = *governor::get_thread_data()->my_task_dispatcher; + + d1::reference_vertex* ref_counter{nullptr}; + auto& reference_map = dispatcher.m_reference_vertex_map; + auto pos = reference_map.find(top_wait_context); + if (pos != reference_map.end()) { + ref_counter = pos->second; + } else { + constexpr std::size_t max_reference_vertex_map_size = 1000; + if (reference_map.size() > max_reference_vertex_map_size) { + // TODO: Research the possibility of using better approach for a clean-up + for (auto it = reference_map.begin(); it != reference_map.end();) { + if (it->second->get_num_child() == 0) { + it->second->~reference_vertex(); + 
cache_aligned_deallocate(it->second); + it = reference_map.erase(it); + } else { + ++it; + } + } + } + + reference_map[top_wait_context] = ref_counter = + new (cache_aligned_allocate(sizeof(d1::reference_vertex))) d1::reference_vertex(top_wait_context, 0); + } + + return ref_counter; +} + } // namespace r1 } // namespace detail } // namespace tbb - diff --git a/third-party/tbb/src/tbb/task_dispatcher.h b/third-party/tbb/src/tbb/task_dispatcher.h index 20c7c731a7..c818934e5a 100644 --- a/third-party/tbb/src/tbb/task_dispatcher.h +++ b/third-party/tbb/src/tbb/task_dispatcher.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -249,15 +249,21 @@ d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) { task_dispatcher& task_disp; execution_data_ext old_execute_data_ext; properties old_properties; + bool is_initially_registered; ~dispatch_loop_guard() { task_disp.m_execute_data_ext = old_execute_data_ext; task_disp.m_properties = old_properties; + if (!is_initially_registered) { + task_disp.m_thread_data->my_arena->my_tc_client.get_pm_client()->unregister_thread(); + task_disp.m_thread_data->my_is_registered = false; + } + __TBB_ASSERT(task_disp.m_thread_data && governor::is_thread_data_set(task_disp.m_thread_data), nullptr); __TBB_ASSERT(task_disp.m_thread_data->my_task_dispatcher == &task_disp, nullptr); } - } dl_guard{ *this, m_execute_data_ext, m_properties }; + } dl_guard{ *this, m_execute_data_ext, m_properties, m_thread_data->my_is_registered }; // The context guard to track fp setting and itt tasks. 
context_guard_helper context_guard; @@ -282,6 +288,11 @@ d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) { m_properties.outermost = false; m_properties.fifo_tasks_allowed = false; + if (!dl_guard.is_initially_registered) { + m_thread_data->my_arena->my_tc_client.get_pm_client()->register_thread(); + m_thread_data->my_is_registered = true; + } + t = get_critical_task(t, ed, isolation, critical_allowed); if (t && m_thread_data->my_inbox.is_idle_state(true)) { // The thread has a work to do. Therefore, marking its inbox as not idle so that diff --git a/third-party/tbb/src/tbb/tcm.h b/third-party/tbb/src/tbb/tcm.h index 05fe0434eb..66ee18a2f0 100644 --- a/third-party/tbb/src/tbb/tcm.h +++ b/third-party/tbb/src/tbb/tcm.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Intel Corporation + Copyright (c) 2023-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -50,7 +50,8 @@ typedef struct _tcm_permit_flags_t { uint32_t stale : 1; uint32_t rigid_concurrency : 1; uint32_t exclusive : 1; - uint32_t reserved : 29; + uint32_t request_as_inactive : 1; + uint32_t reserved : 28; } tcm_permit_flags_t; typedef struct _tcm_callback_flags_t { diff --git a/third-party/tbb/src/tbb/tcm_adaptor.cpp b/third-party/tbb/src/tbb/tcm_adaptor.cpp index e20ebb831d..85ca125b4e 100644 --- a/third-party/tbb/src/tbb/tcm_adaptor.cpp +++ b/third-party/tbb/src/tbb/tcm_adaptor.cpp @@ -170,7 +170,7 @@ class tcm_client : public pm_client { __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); } - void init(d1::constraints& constraints) { + void init(tcm_client_id_t client_id, d1::constraints& constraints) { __TBB_ASSERT(tcm_request_permit, nullptr); __TBB_ASSERT(tcm_deactivate_permit, nullptr); @@ -190,6 +190,12 @@ class tcm_client : public pm_client { my_permit_request.min_sw_threads = 0; my_permit_request.max_sw_threads = 0; + my_permit_request.flags.request_as_inactive = 1; + + 
tcm_result_t res = tcm_request_permit(client_id, my_permit_request, this, &my_permit_handle, nullptr); + __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); + + my_permit_request.flags.request_as_inactive = 0; } void register_thread() override { @@ -279,7 +285,7 @@ pm_client* tcm_adaptor::create_client(arena& a) { } void tcm_adaptor::register_client(pm_client* c, d1::constraints& constraints) { - static_cast(c)->init(constraints); + static_cast(c)->init(my_impl->client_id, constraints); } void tcm_adaptor::unregister_and_destroy_client(pm_client& c) { diff --git a/third-party/tbb/src/tbb/thread_data.h b/third-party/tbb/src/tbb/thread_data.h index 9dfa492a72..422ec694ec 100644 --- a/third-party/tbb/src/tbb/thread_data.h +++ b/third-party/tbb/src/tbb/thread_data.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -101,6 +101,7 @@ class thread_data : public ::rml::job thread_data(unsigned short index, bool is_worker) : my_arena_index{ index } , my_is_worker{ is_worker } + , my_is_registered { false } , my_task_dispatcher{ nullptr } , my_arena{ nullptr } , my_last_client{ nullptr } @@ -145,6 +146,8 @@ class thread_data : public ::rml::job //! Indicates if the thread is created by RML const bool my_is_worker; + bool my_is_registered; + //! 
The current task dipsatcher task_dispatcher* my_task_dispatcher; diff --git a/third-party/tbb/src/tbb/thread_request_serializer.cpp b/third-party/tbb/src/tbb/thread_request_serializer.cpp index 6019f732b4..41cf51b0b0 100644 --- a/third-party/tbb/src/tbb/thread_request_serializer.cpp +++ b/third-party/tbb/src/tbb/thread_request_serializer.cpp @@ -100,13 +100,12 @@ void thread_request_serializer_proxy::set_active_num_workers(int soft_limit) { if (soft_limit != 0) { my_is_mandatory_concurrency_enabled = false; - my_serializer.set_active_num_workers(soft_limit); - } else { - if (my_num_mandatory_requests > 0 && !my_is_mandatory_concurrency_enabled) { - my_is_mandatory_concurrency_enabled = true; - my_serializer.set_active_num_workers(1); - } + } else if (my_num_mandatory_requests > 0) { + my_is_mandatory_concurrency_enabled = true; + soft_limit = 1; } + + my_serializer.set_active_num_workers(soft_limit); } int thread_request_serializer_proxy::num_workers_requested() { return my_serializer.num_workers_requested(); } diff --git a/third-party/tbb/src/tbb/threading_control.cpp b/third-party/tbb/src/tbb/threading_control.cpp index 1ca1837826..7a62b337f6 100644 --- a/third-party/tbb/src/tbb/threading_control.cpp +++ b/third-party/tbb/src/tbb/threading_control.cpp @@ -334,7 +334,12 @@ bool threading_control::try_destroy_client(threading_control::client_snapshot de } void threading_control::set_active_num_workers(unsigned soft_limit) { - threading_control* thr_control = get_threading_control(/*public = */ false); + threading_control* thr_control{nullptr}; + { + global_mutex_type::scoped_lock lock(g_threading_control_mutex); + thr_control = get_threading_control(/*public = */ false); + } + if (thr_control != nullptr) { thr_control->my_pimpl->set_active_num_workers(soft_limit); thr_control->release(/*is_public=*/false, /*blocking_terminate=*/false); diff --git a/third-party/tbb/src/tbbbind/tbb_bind.cpp b/third-party/tbb/src/tbbbind/tbb_bind.cpp index 50119e4e54..bb52e11517 
100644 --- a/third-party/tbb/src/tbbbind/tbb_bind.cpp +++ b/third-party/tbb/src/tbbbind/tbb_bind.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2019-2023 Intel Corporation + Copyright (c) 2019-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -88,12 +88,15 @@ class system_topology { if ( hwloc_topology_init( &topology ) == 0 ) { initialization_state = topology_allocated; #if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT - if ( groups_num == 1 && - hwloc_topology_set_flags(topology, - HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | - HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING - ) != 0 - ) { + unsigned long flags = 0; + if (groups_num > 1) { + // HWLOC x86 backend might interfere with process affinity mask on + // Windows systems with multiple processor groups. + flags = HWLOC_TOPOLOGY_FLAG_DONT_CHANGE_BINDING; + } else { + flags = HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING; + } + if (hwloc_topology_set_flags(topology, flags) != 0) { return; } #endif diff --git a/third-party/tbb/src/tbbmalloc/frontend.cpp b/third-party/tbb/src/tbbmalloc/frontend.cpp index 77f9d6594e..c9aaf46337 100644 --- a/third-party/tbb/src/tbbmalloc/frontend.cpp +++ b/third-party/tbb/src/tbbmalloc/frontend.cpp @@ -817,6 +817,7 @@ unsigned int getSmallObjectIndex(unsigned int size) /* * Depending on indexRequest, for a given size return either the index into the bin * for objects of this size, or the actual size of objects in this bin. + * TODO: Change return type to unsigned short. 
*/ template static unsigned int getIndexOrObjectSize (unsigned int size) @@ -1581,6 +1582,7 @@ void Block::initEmptyBlock(TLSData *tls, size_t size) unsigned int objSz = getObjectSize(size); cleanBlockHeader(); + MALLOC_ASSERT(objSz <= USHRT_MAX, "objSz must not be less 2^16-1"); objectSize = objSz; markOwned(tls); // bump pointer should be prepared for first allocation - thus mode it down to objectSize diff --git a/third-party/tbb/src/tbbmalloc/large_objects.h b/third-party/tbb/src/tbbmalloc/large_objects.h index 8519784267..58d7c81a7b 100644 --- a/third-party/tbb/src/tbbmalloc/large_objects.h +++ b/third-party/tbb/src/tbbmalloc/large_objects.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -81,18 +81,25 @@ struct HugeBinStructureProps { static size_t alignToBin(size_t size) { MALLOC_ASSERT(size >= StepFactor, "Size must not be less than the StepFactor"); - size_t minorStepExp = BitScanRev(size) - StepFactorExp; + + int sizeExp = (int)BitScanRev(size); + MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= stepfactor > 0"); + MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= stepFactor"); + int minorStepExp = sizeExp - StepFactorExp; + return alignUp(size, 1ULL << minorStepExp); } // Sizes between the power of 2 values are approximated to StepFactor. 
static int sizeToIdx(size_t size) { MALLOC_ASSERT(MinSize <= size && size <= MaxSize, ASSERT_TEXT); + int sizeExp = (int)BitScanRev(size); // same as __TBB_Log2 - MALLOC_ASSERT(sizeExp >= 0, "A shift amount (sizeExp) must not be negative"); - size_t majorStepSize = 1ULL << sizeExp; + MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= stepfactor > 0"); + MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= stepFactor"); int minorStepExp = sizeExp - StepFactorExp; - MALLOC_ASSERT(minorStepExp >= 0, "A shift amount (minorStepExp) must not be negative"); + + size_t majorStepSize = 1ULL << sizeExp; int minorIdx = (size - majorStepSize) >> minorStepExp; MALLOC_ASSERT(size == majorStepSize + ((size_t)minorIdx << minorStepExp), "Size is not aligned on the bin"); diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h index 44fa47aaab..bc0ee2ffb5 100644 --- a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h +++ b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h @@ -232,9 +232,13 @@ template class BitMaskMax : public BitMaskBasic { public: void set(size_t idx, bool val) { + MALLOC_ASSERT(NUM >= idx + 1, ASSERT_TEXT); + BitMaskBasic::set(NUM - 1 - idx, val); } int getMaxTrue(unsigned startIdx) const { + MALLOC_ASSERT(NUM >= startIdx + 1, ASSERT_TEXT); + int p = BitMaskBasic::getMinTrue(NUM-startIdx-1); return -1==p? -1 : (int)NUM - 1 - p; } @@ -496,7 +500,11 @@ class HugePagesStatus { MALLOC_ASSERT(!pageSize, "Huge page size can't be set twice. 
Double initialization."); // Initialize object variables - pageSize = hugePageSize * 1024; // was read in KB from meminfo + if (hugePageSize > -1) { + pageSize = hugePageSize * 1024; // was read in KB from meminfo + } else { + pageSize = 0; + } isHPAvailable = hpAvailable; isTHPAvailable = thpAvailable; } diff --git a/third-party/tbb/src/tbbmalloc_proxy/proxy.cpp b/third-party/tbb/src/tbbmalloc_proxy/proxy.cpp index 23b9c19c1c..954583ba5f 100644 --- a/third-party/tbb/src/tbbmalloc_proxy/proxy.cpp +++ b/third-party/tbb/src/tbbmalloc_proxy/proxy.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -431,14 +431,12 @@ void __TBB_malloc__free_base(void *ptr) const char* known_bytecodes[] = { #if _WIN64 // "========================================================" - 56 symbols + "E9********CCCC", // multiple - jmp(0xE9) with address followed by empty space (0xCC - INT 3) "4883EC284885C974", // release free() "4883EC284885C975", // release _msize() "4885C974375348", // release free() 8.0.50727.42, 10.0 - "E907000000CCCC", // release _aligned_msize(), _aligned_free() ucrtbase.dll "C7442410000000008B", // release free() ucrtbase.dll 10.0.14393.33 - "E90B000000CCCC", // release _msize() ucrtbase.dll 10.0.14393.33 "48895C24085748", // release _aligned_msize() ucrtbase.dll 10.0.14393.33 - "E903000000CCCC", // release _aligned_msize() ucrtbase.dll 10.0.16299.522 "48894C24084883EC28BA", // debug prologue "4C894424184889542410", // debug _aligned_msize() 10.0 "48894C24084883EC2848", // debug _aligned_free 10.0 @@ -602,8 +600,8 @@ _expand (by dummy implementation) ??_V@YAXPEAX@Z void * operator new[](unsigned __int64) (intel64) ??3@YAXPEAX@Z operator delete (intel64) ??_V@YAXPEAX@Z operator delete[] (intel64) -??2@YAPAXIABUnothrow_t@std@@@Z void * operator new (size_t sz, const std::nothrow_t&) 
throw() (optional) -??_U@YAPAXIABUnothrow_t@std@@@Z void * operator new[] (size_t sz, const std::nothrow_t&) throw() (optional) +??2@YAPAXIABUnothrow_t@std@@@Z void * operator new (size_t sz, const std::nothrow_t&) noexcept (optional) +??_U@YAPAXIABUnothrow_t@std@@@Z void * operator new[] (size_t sz, const std::nothrow_t&) noexcept (optional) and these functions have runtime-specific replacement: realloc diff --git a/third-party/tbb/test/CMakeLists.txt b/third-party/tbb/test/CMakeLists.txt index cfde681bd6..fb4a78bdbb 100644 --- a/third-party/tbb/test/CMakeLists.txt +++ b/third-party/tbb/test/CMakeLists.txt @@ -79,7 +79,8 @@ function(tbb_add_test) $<$:TBB_USE_DEBUG> $<$:__TBB_CPF_BUILD=1> $<$>:__TBB_DYNAMIC_LOAD_ENABLED=0> - $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>) + $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1> + $<$:__TBB_TCM_TESTING_ENABLED=1>) target_link_libraries(${_tbb_test_TARGET_NAME} PRIVATE ${_tbb_test_DEPENDENCIES} Threads::Threads ${TBB_COMMON_LINK_LIBS}) @@ -380,7 +381,9 @@ if (TARGET TBB::tbb) # Define the tests tbb_add_test(SUBDIR tbb NAME test_tick_count DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_allocators DEPENDENCIES TBB::tbb) - tbb_add_test(SUBDIR tbb NAME test_arena_priorities DEPENDENCIES TBB::tbb) + if (NOT TBB_TCM_TESTING) + tbb_add_test(SUBDIR tbb NAME test_arena_priorities DEPENDENCIES TBB::tbb) + endif() tbb_add_test(SUBDIR tbb NAME test_dynamic_link DEPENDENCIES TBB::tbb) if (LINKER_HAS_NO_AS_NEEDED) # The linker may not detect a dependency on pthread in static variable constructors. 
@@ -389,7 +392,7 @@ if (TARGET TBB::tbb) if (APPLE OR ANDROID_PLATFORM) target_link_libraries(test_dynamic_link PRIVATE -rdynamic) endif() - if (WIN32) + if (WIN32 AND NOT TBB_TCM_TESTING) tbb_add_test(SUBDIR tbb NAME test_numa_dist DEPENDENCIES TBB::tbb) endif() tbb_add_test(SUBDIR tbb NAME test_collaborative_call_once DEPENDENCIES TBB::tbb) @@ -451,8 +454,23 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_environment_whitebox DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_hw_concurrency DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_eh_thread DEPENDENCIES TBB::tbb) - tbb_add_test(SUBDIR tbb NAME test_global_control DEPENDENCIES TBB::tbb) + if (NOT TBB_TCM_TESTING) + tbb_add_test(SUBDIR tbb NAME test_global_control DEPENDENCIES TBB::tbb) + endif() tbb_add_test(SUBDIR tbb NAME test_task DEPENDENCIES TBB::tbb) + if (TBB_TCM_TESTING AND NOT WINDOWS_STORE AND NOT TBB_WINDOWS_DRIVER) + add_test(NAME test_tcm_enabled COMMAND test_task --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) + set_tests_properties(test_tcm_enabled PROPERTIES + ENVIRONMENT "TBB_VERSION=1;TCM_ENABLE=1" + PASS_REGULAR_EXPRESSION "TCM: VERSION.*" + FAIL_REGULAR_EXPRESSION "TCM: TCM *disabled" + ) + add_test(NAME test_tcm_disabled COMMAND test_task --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) + set_tests_properties(test_tcm_disabled PROPERTIES + ENVIRONMENT "TBB_VERSION=1;TCM_ENABLE=0" + PASS_REGULAR_EXPRESSION "TCM: TCM *disabled" + ) + endif() if (TBB_FUZZ_TESTING AND NOT WIN32) if (NOT ((CMAKE_CXX_COMPILER_ID STREQUAL Clang) OR (CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM))) @@ -521,7 +539,9 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_blocked_range3d DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_blocked_rangeNd DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_concurrent_vector DEPENDENCIES TBB::tbb) - tbb_add_test(SUBDIR conformance NAME 
conformance_global_control DEPENDENCIES TBB::tbb) + if (NOT TBB_TCM_TESTING) + tbb_add_test(SUBDIR conformance NAME conformance_global_control DEPENDENCIES TBB::tbb) + endif() tbb_add_test(SUBDIR conformance NAME conformance_concurrent_hash_map DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_enumerable_thread_specific DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_combinable DEPENDENCIES TBB::tbb) diff --git a/third-party/tbb/test/common/exception_handling.h b/third-party/tbb/test/common/exception_handling.h index 55dbe0fc20..1d1b62c3ba 100644 --- a/third-party/tbb/test/common/exception_handling.h +++ b/third-party/tbb/test/common/exception_handling.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -102,7 +102,7 @@ class test_exception : public std::exception { public: test_exception ( const char* description ) : my_description(description) {} - const char* what() const throw() override { return my_description; } + const char* what() const noexcept override { return my_description; } }; class solitary_test_exception : public test_exception { diff --git a/third-party/tbb/test/common/graph_utils.h b/third-party/tbb/test/common/graph_utils.h index 24814d5fd3..2c2099f6df 100644 --- a/third-party/tbb/test/common/graph_utils.h +++ b/third-party/tbb/test/common/graph_utils.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -35,7 +35,7 @@ #include "common/spin_barrier.h" -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; // Needed conversion to and from continue_msg, but didn't want to add // conversion operators to the class, since we don't want it in general, @@ -277,11 +277,17 @@ struct harness_counting_receiver : public tbb::flow::receiver { return my_graph; } - tbb::detail::d1::graph_task *try_put_task( const T & ) override { + tbb::detail::d2::graph_task *try_put_task( const T & ) override { ++my_count; - return const_cast(SUCCESSFULLY_ENQUEUED); + return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::detail::d2::graph_task *try_put_task( const T &t, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(t); + } +#endif + void validate() { size_t n = my_count; CHECK( n == num_copies*max_value ); @@ -323,14 +329,20 @@ struct harness_mapped_receiver : public tbb::flow::receiver { my_multiset = new multiset_type; } - tbb::detail::d1::graph_task* try_put_task( const T &t ) override { + tbb::detail::d2::graph_task* try_put_task( const T &t ) override { if ( my_multiset ) { (*my_multiset).emplace( t ); } else { ++my_count; } - return const_cast(SUCCESSFULLY_ENQUEUED); + return const_cast(SUCCESSFULLY_ENQUEUED); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::detail::d2::graph_task *try_put_task( const T &t, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(t); } +#endif tbb::flow::graph& graph_reference() const override { return my_graph; @@ -404,6 +416,12 @@ struct harness_counting_sender : public tbb::flow::sender { } } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( T & v, tbb::detail::d2::message_metainfo& ) override { + return try_get(v); + } +#endif + bool try_put_once() { successor_type *s = my_receiver; size_t i = my_count++; @@ -842,7 +860,7 @@ struct throwing_body{ if(my_counter == Threshold) throw Threshold; } - 
+ template output_tuple_type operator()(const input_type&) { ++my_counter; diff --git a/third-party/tbb/test/common/utils_dynamic_libs.h b/third-party/tbb/test/common/utils_dynamic_libs.h index 5e5365fc8f..99afca3840 100644 --- a/third-party/tbb/test/common/utils_dynamic_libs.h +++ b/third-party/tbb/test/common/utils_dynamic_libs.h @@ -58,7 +58,7 @@ namespace utils { #define EXT ".dylib" #endif // Android SDK build system does not support .so file name versioning -#elif __FreeBSD__ || __NetBSD__ || __sun || _AIX || __ANDROID__ +#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ #define EXT ".so" #elif __unix__ // Order of these elif's matters! #define EXT __TBB_STRING(.so.2) diff --git a/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp b/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp index 0c3ec6e93a..889739b9d0 100644 --- a/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp +++ b/third-party/tbb/test/conformance/conformance_concurrent_hash_map.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ that concurrent_hash_map uses only the required interface. 
*/ class MyException : public std::bad_alloc { public: - virtual const char *what() const throw() override { return "out of items limit"; } - virtual ~MyException() throw() {} + virtual const char *what() const noexcept override { return "out of items limit"; } + virtual ~MyException() noexcept {} }; /** Has tightly controlled interface so that we can verify diff --git a/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp b/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp index 32c1652e94..9bda9a4613 100644 --- a/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp +++ b/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -777,8 +777,8 @@ void TestConcurrentPushPop() { class Foo_exception : public std::bad_alloc { public: - virtual const char *what() const throw() override { return "out of Foo limit"; } - virtual ~Foo_exception() throw() {} + virtual const char *what() const noexcept override { return "out of Foo limit"; } + virtual ~Foo_exception() noexcept {} }; #if TBB_USE_EXCEPTIONS diff --git a/third-party/tbb/test/conformance/conformance_global_control.cpp b/third-party/tbb/test/conformance/conformance_global_control.cpp index 578ae78019..250fda906b 100644 --- a/third-party/tbb/test/conformance/conformance_global_control.cpp +++ b/third-party/tbb/test/conformance/conformance_global_control.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include "common/spin_barrier.h" #include "common/utils.h" #include "common/utils_concurrency_limit.h" +#include "common/cpu_usertime.h" #include "oneapi/tbb/global_control.h" #include "oneapi/tbb/parallel_for.h" @@ -347,6 +348,30 @@ TEST_CASE("simple prolong lifetime 3") { tbb::parallel_for(0, 10, utils::DummyBody()); } +//! \brief \ref regression \ref interface \ref requirement +TEST_CASE("Test worker threads remain inactive in enforced serial execution mode") { + auto num_threads = utils::get_platform_max_threads(); + utils::SpinBarrier barrier{num_threads}; + + // Warm-up threads + tbb::parallel_for(std::size_t(0), num_threads, [&] (std::size_t) { + barrier.wait(); + }); + + tbb::global_control control(tbb::global_control::max_allowed_parallelism, 1); + + std::thread thr([&] { + tbb::parallel_for(0, 100000, [&] (int) { + utils::doDummyWork(100); + }); + }); + + // Workers should sleep because of global_control enforced serial execution of tasks + TestCPUUserTime(utils::get_platform_max_threads() - 1); + + thr.join(); +} + // The test cannot work correctly with statically linked runtime. // TODO: investigate a failure in debug with MSVC #if (!_MSC_VER || (defined(_DLL) && !defined(_DEBUG))) && !EMSCRIPTEN diff --git a/third-party/tbb/test/conformance/conformance_graph.cpp b/third-party/tbb/test/conformance/conformance_graph.cpp index 3492660783..1f73999ff7 100644 --- a/third-party/tbb/test/conformance/conformance_graph.cpp +++ b/third-party/tbb/test/conformance/conformance_graph.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -158,6 +158,8 @@ void test_join_node_rf_reset_protocol(){ CHECK_MESSAGE((!testing_node.try_get(tmp)), "All buffers must be emptied"); } +// global_control::max_allowed_parallelism functionality is not covered by TCM +#if !__TBB_TCM_TESTING_ENABLED //! Graph reset //! \brief \ref requirement TEST_CASE("graph reset with rf_reset_protocol") { @@ -179,6 +181,7 @@ TEST_CASE("graph reset with rf_reset_protocol") { test_limiter_node_rf_reset_protocol(); test_join_node_rf_reset_protocol(); } +#endif //! Graph reset rf_clear_edges //! \brief \ref requirement diff --git a/third-party/tbb/test/tbb/test_broadcast_node.cpp b/third-party/tbb/test/tbb/test_broadcast_node.cpp index b3905e6d60..662a08331d 100644 --- a/third-party/tbb/test/tbb/test_broadcast_node.cpp +++ b/third-party/tbb/test/tbb/test_broadcast_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ //! 
\brief Test for [flow_graph.broadcast_node] specification -#define TBB_INTERNAL_NAMESPACE detail::d1 +#define TBB_INTERNAL_NAMESPACE detail::d2 namespace tbb { using task = TBB_INTERNAL_NAMESPACE::graph_task; } @@ -73,6 +73,12 @@ class counting_array_receiver : public tbb::flow::receiver { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -241,6 +247,166 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +// Basic idea of the following tests is to check that try_put_and_wait(message) call for broadcast_node +// processes all of the previous jobs required to process message, the message itself, but does +// not process the elements submitted later or not required to process the message +// These tests submit start_work_items using the regular try_put and then submit wait_message +// with try_put_and_wait. During the completion of the graph, new_work_items would be submitted +// once the wait_message arrives. 
+void test_try_put_and_wait_spawning_and_serial_receiver() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items_unlimited, processed_items_serial; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::broadcast_node broadcast(g); + + // Broadcast to 2 function_nodes, one with unlimited concurrency and the other serial + tbb::flow::function_node unlimited(g, tbb::flow::unlimited, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + broadcast.try_put(item); + } + } + processed_items_unlimited.emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, unlimited); + + tbb::flow::function_node serial(g, tbb::flow::serial, + [&](int input) noexcept { + processed_items_serial.emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, serial); + + for (int i = 0; i < wait_message; ++i) { + broadcast.try_put(i); + } + + broadcast.try_put_and_wait(wait_message); + + size_t unlimited_check_index = 0, serial_check_index = 0; + + // For the unlimited function_node, all of the tasks for start_work_items and wait_message would be spawned + // and hence processed by the thread in LIFO order. 
+ // The first processed item is expected to be wait_message since it was spawned last + CHECK_MESSAGE(processed_items_unlimited.size() == new_work_items.size() + start_work_items.size(), + "Unexpected number of processed items"); + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == wait_message, "Unexpected items processing"); + for (int i = int(new_work_items.size()) - 1; i >= 0; --i) { + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == new_work_items[i], "Unexpected items processing"); + } + for (int i = int(start_work_items.size()) - 1; i >= 1; --i) { + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == start_work_items[i], "Unexpected items processing"); + } + + // Serial queueing function_node should add all start_work_items except the first one into the queue + // and then process them in FIFO order. + // wait_message would also be added to the queue, but would be processed later + CHECK_MESSAGE(processed_items_serial.size() == start_work_items.size() + 1, + "Unexpected number of processed items"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items_serial[serial_check_index++] == item, "Unexpected items processing"); + } + CHECK_MESSAGE(processed_items_serial[serial_check_index++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == start_work_items[0], "Unexpected items processing"); + + // For serial queueing function_node, the new_work_items are expected to be processed while calling to wait_for_all + // They would be queued and processed later in FIFO order + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items_serial[serial_check_index++] == item, "Unexpected items processing"); + } + CHECK(serial_check_index == processed_items_serial.size()); + CHECK(unlimited_check_index == processed_items_unlimited.size()); + }); +} + +void test_try_put_and_wait_spawning_receivers() { + 
tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + int wait_message = 10; + int num_successors = wait_message - 1; + + std::vector start_work_items; + std::vector> processed_items(num_successors); + std::vector new_work_items; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::broadcast_node broadcast(g); + + std::vector> successors; + successors.reserve(num_successors); + for (int i = 0; i < num_successors; ++i) { + successors.emplace_back(g, tbb::flow::unlimited, + [&, i](int input) noexcept { + if (input == wait_message) { + broadcast.try_put(new_work_items[i]); + } + processed_items[i].emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, successors.back()); + } + + for (int i = 0; i < wait_message; ++i) { + broadcast.try_put(i); + } + + broadcast.try_put_and_wait(wait_message); + + for (int i = num_successors - 1; i >= 0; --i) { + size_t check_index = 0; + for (int j = num_successors - 1; j != i; --j) { + CHECK_MESSAGE(processed_items[i][check_index++] == new_work_items[j], "Unexpected items processing"); + } + CHECK_MESSAGE(processed_items[i][check_index++] == wait_message, "Unexpected items processing"); + for (int j = i; j >= 1; --j) { + CHECK_MESSAGE(processed_items[i][check_index++] == new_work_items[j], "Unexpected items processing"); + } + } + + g.wait_for_all(); + + for (auto& processed_item : processed_items) { + size_t check_index = num_successors; + CHECK_MESSAGE(processed_item[check_index++] == new_work_items[0], "Unexpected items processing"); + for (int i = int(start_work_items.size()) - 1; i >= 0; --i) { + CHECK_MESSAGE(processed_item[check_index++] == start_work_items[i], "Unexpected items processing"); + } + } + }); +} + +void test_try_put_and_wait() { + test_try_put_and_wait_spawning_and_serial_receiver(); + test_try_put_and_wait_spawning_receivers(); +} +#endif // 
__TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test serial broadcasts //! \brief \ref error_guessing TEST_CASE("Serial broadcasts"){ @@ -282,3 +448,9 @@ TEST_CASE("Deduction guides"){ } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test broadcast_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_buffer_node.cpp b/third-party/tbb/test/tbb/test_buffer_node.cpp index 89f4485b3d..527005aecb 100644 --- a/third-party/tbb/test/tbb/test_buffer_node.cpp +++ b/third-party/tbb/test/tbb/test_buffer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,11 +24,11 @@ #include "common/graph_utils.h" #include "common/test_follows_and_precedes_api.h" +#include "test_buffering_try_put_and_wait.h" //! \file test_buffer_node.cpp //! 
\brief Test for [flow_graph.buffer_node] specification - #define N 1000 #define C 10 @@ -307,7 +307,7 @@ int test_parallel(int num_threads) { // Chained buffers ( 2 & 3 ), single sender, items at last buffer in arbitrary order // -#define TBB_INTERNAL_NAMESPACE detail::d1 +#define TBB_INTERNAL_NAMESPACE detail::d2 using tbb::TBB_INTERNAL_NAMESPACE::register_predecessor; using tbb::TBB_INTERNAL_NAMESPACE::remove_predecessor; @@ -455,6 +455,161 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_buffer_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in LIFO order + // 4. wait_message would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process wait_message and add new_work_items to the buffer1 + // 6. 
forward_task for new_work_items would be spawned, wait_message would be buffered in the buffer2 + // 7. function task for next FIFO item in the queue would be spawned + // 8. forward_task for wait_message in buffer2 would be executed without spawning + // 9. writer task for wait_message would be executed without spawning and write wait_message to the buffer + // 10. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in LIFO order + // 11. function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { wait_message, start_work_items LIFO, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == 1, "try_put_and_wait should process only the wait_message"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "try_put_and_wait should process only the wait_message"); + + for (std::size_t index = start_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1], + "wait_for_all should process start_work_items LIFO"); + } + for (std::size_t index = new_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1], + "wait_for_all should process new_work_items LIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. 
g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in LIFO order + // 6. wait_message would be taken first and push new_work_items to the buffer + // Expected items processing { occupier, wait_message, new_work_items LIFO, start_work_items LIFO } + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == 2, "Only wait_message and occupier should be processed by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (std::size_t index = new_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1], + "wait_for_all should process new_work_items LIFO"); + } + for (std::size_t index = start_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1], + "wait_for_all should process start_work_items LIFO"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int threshold : thresholds) { + std::vector processed_items; + + // test_buffer_reserve tests the following graph + // buffer -> limiter -> 
function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items); + + // Expected effect: + // 1. start_work_items would be pushed to the buffer + // 2. wait_message_would be pushed to the buffer + // 3. forward task of the buffer would push wait_message to the limiter node. + // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for wait_message processing + // 5. wait_message would be processed that would add new_work_items to the buffer + // 6. decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // Since the reservation always accepts the front element of the buffer + // it is expected that the items would be taken from the buffer in FIFO order + // instead of LIFO on try_get for buffer_node + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == 1, "try_put_and_wait should process only wait_message"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + #include //! Test buffer_node with parallel and serial neighbours @@ -489,8 +644,15 @@ TEST_CASE("Follows and precedes API"){ #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT //! Test deduction guides -//! \brief requirement +//! 
\brief \ref requirement TEST_CASE("Deduction guides"){ test_deduction_guides(); } #endif + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test buffer_node try_put_and_wait") { + test_buffer_node_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_buffering_try_put_and_wait.h b/third-party/tbb/test/tbb/test_buffering_try_put_and_wait.h new file mode 100644 index 0000000000..300521233f --- /dev/null +++ b/third-party/tbb/test/tbb/test_buffering_try_put_and_wait.h @@ -0,0 +1,189 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_test_tbb_buffering_try_put_and_wait_H +#define __TBB_test_tbb_buffering_try_put_and_wait_H + +#include +#include + +#include + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + +namespace test_try_put_and_wait { + +template +std::size_t test_buffer_push(const std::vector& start_work_items, + int wait_message, + const std::vector& new_work_items, + std::vector& processed_items, + Args... 
args) +{ + std::size_t after_try_put_and_wait_start_index = 0; + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + using function_node_type = tbb::flow::function_node; + + BufferingNode buffer1(g, args...); + + function_node_type function(g, tbb::flow::serial, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer1.try_put(item); + } + } + return input; + }); + + BufferingNode buffer2(g, args...); + + function_node_type writer(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer1, function); + tbb::flow::make_edge(function, buffer2); + tbb::flow::make_edge(buffer2, writer); + + for (auto item : start_work_items) { + buffer1.try_put(item); + } + + buffer1.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +template +std::size_t test_buffer_pull(const std::vector& start_work_items, + int wait_message, + int occupier, + const std::vector& new_work_items, + std::vector& processed_items, + Args... 
args) +{ + tbb::task_arena arena(1); + std::size_t after_try_put_and_wait_start_index = 0; + + arena.execute([&] { + tbb::flow::graph g; + + using function_node_type = tbb::flow::function_node; + + BufferingNode buffer(g, args...); + + function_node_type function(g, tbb::flow::serial, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer.try_put(item); + } + } + + processed_items.emplace_back(input); + return 0; + }); + + // Occupy the concurrency of function_node + // This call spawns the task to process the occupier + function.try_put(occupier); + + // Make edge between buffer and function after occupying the concurrency + // To ensure that forward task of the buffer would be spawned after the occupier task + // And the function_node would reject the items from the buffer + // and process them later by calling try_get on the buffer + tbb::flow::make_edge(buffer, function); + + for (auto item : start_work_items) { + buffer.try_put(item); + } + + buffer.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +template +std::size_t test_buffer_reserve(std::size_t limiter_threshold, + const std::vector& start_work_items, + int wait_message, + const std::vector& new_work_items, + std::vector& processed_items, + Args... 
args) +{ + tbb::task_arena arena(1); + std::size_t after_try_put_and_wait_start_index = 0; + + arena.execute([&] { + tbb::flow::graph g; + + BufferingNode buffer(g, args...); + + tbb::flow::limiter_node limiter(g, limiter_threshold); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer.try_put(item); + } + } + // Explicitly put to the decrementer instead of making edge + // to guarantee that the next task would be spawned and not returned + // to the current thread as the next task + // Otherwise, all elements would be processed during the try_put_and_wait + limiter.decrementer().try_put(1); + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer, limiter); + tbb::flow::make_edge(limiter, function); + + for (auto item : start_work_items) { + buffer.try_put(item); + } + + buffer.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +} // test_try_put_and_wait + +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +#endif // __TBB_test_tbb_buffering_try_put_and_wait_H diff --git a/third-party/tbb/test/tbb/test_continue_node.cpp b/third-party/tbb/test/tbb/test_continue_node.cpp index 8c2c5c5bb9..1cfea3df43 100644 --- a/third-party/tbb/test/tbb/test_continue_node.cpp +++ b/third-party/tbb/test/tbb/test_continue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -63,7 +63,7 @@ template< typename OutputType > void run_continue_nodes( int p, tbb::flow::graph& g, tbb::flow::continue_node< OutputType >& n ) { fake_continue_sender fake_sender; for (size_t i = 0; i < N; ++i) { - tbb::detail::d1::register_predecessor(n, fake_sender); + tbb::detail::d2::register_predecessor(n, fake_sender); } for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) { @@ -138,7 +138,7 @@ void continue_nodes_with_copy( ) { tbb::flow::continue_node< OutputType > exe_node( g, cf ); fake_continue_sender fake_sender; for (size_t i = 0; i < N; ++i) { - tbb::detail::d1::register_predecessor(exe_node, fake_sender); + tbb::detail::d2::register_predecessor(exe_node, fake_sender); } for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) { @@ -354,6 +354,176 @@ void test_successor_cache_specialization() { "Wrong number of messages is passed via continue_node"); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait_default() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + int processed_items = 0; + + tbb::flow::continue_node* start_node = nullptr; + + tbb::flow::continue_node cont(g, + [&](tbb::flow::continue_msg) noexcept { + static bool put_ten_msgs = true; + if (put_ten_msgs) { + for (std::size_t i = 0; i < 10; ++i) { + start_node->try_put(tbb::flow::continue_msg{}); + } + put_ten_msgs = false; + } + }); + + start_node = &cont; + + tbb::flow::continue_node writer(g, + [&](tbb::flow::continue_msg) noexcept { + ++processed_items; + }); + + tbb::flow::make_edge(cont, writer); + + cont.try_put_and_wait(tbb::flow::continue_msg{}); + + // Only 1 item should be processed, with the additional 10 items having been spawned + CHECK_MESSAGE(processed_items == 1, "Unexpected items processing"); + + g.wait_for_all(); + + // The additional 10 items should be processed + CHECK_MESSAGE(processed_items == 11, "Unexpected items processing"); + }); +} + +void 
test_try_put_and_wait_lightweight() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::continue_node* start_node = nullptr; + + tbb::flow::continue_node cont(g, + [&](tbb::flow::continue_msg) noexcept { + static int counter = 0; + int i = counter++; + if (i == wait_message) { + for (auto item : new_work_items) { + (void)item; + start_node->try_put(tbb::flow::continue_msg{}); + } + } + return i; + }); + + start_node = &cont; + + tbb::flow::function_node writer(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(cont, writer); + + for (auto item : start_work_items) { + (void)item; + cont.try_put(tbb::flow::continue_msg{}); + } + + cont.try_put_and_wait(tbb::flow::continue_msg{}); + + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + new_work_items.size() + 1, + "Unexpected number of elements processed"); + + std::size_t check_index = 0; + + // For lightweight continue_node, start_work_items are expected to be processed first + // while putting items into the first node + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + // wait_message would be processed only after new_work_items + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + CHECK(check_index == processed_items.size()); + }); +} + +void test_metainfo_buffering() { + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + 
std::vector call_order; + + tbb::flow::continue_node* b_ptr = nullptr; + + tbb::flow::continue_node a(g, + [&](tbb::flow::continue_msg) noexcept { + call_order.push_back('A'); + static std::once_flag flag; // Send a signal to B only in the first call + std::call_once(flag, [&]{ b_ptr->try_put(tbb::flow::continue_msg{}); }); + }); + + tbb::flow::continue_node b(g, + [&](tbb::flow::continue_msg) noexcept { + call_order.push_back('B'); + a.try_put(tbb::flow::continue_msg{}); + }); + + b_ptr = &b; + + tbb::flow::continue_node c(g, + [&](tbb::flow::continue_msg) noexcept { + call_order.push_back('C'); + }); + + tbb::flow::make_edge(a, c); + tbb::flow::make_edge(b, c); + + a.try_put_and_wait(tbb::flow::continue_msg{}); + + // Inside the first call of A, we send a signal to B. + // Both of them send signals to C. Since C lightweight, it is processed immediately + // upon receiving signals from both predecessors. This completes the wait. + CHECK(call_order == std::vector{'A', 'B', 'C'}); + + g.wait_for_all(); + + // B previously sent a signal to A, which has now been processed. + // A sends a signal to C, which is not processed because no signal is received from B this time. + CHECK(call_order == std::vector{'A', 'B', 'C', 'A'}); + }); +} + +void test_try_put_and_wait() { + test_try_put_and_wait_default(); + test_try_put_and_wait_lightweight(); + test_metainfo_buffering(); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test concurrent continue_node for correctness //! \brief \ref error_guessing TEST_CASE("Concurrency testing") { @@ -418,3 +588,10 @@ TEST_CASE("constraints for continue_node body") { static_assert(!can_call_continue_node_ctor>); } #endif // __TBB_CPP20_CONCEPTS_PRESENT + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test continue_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_eh_flow_graph.cpp b/third-party/tbb/test/tbb/test_eh_flow_graph.cpp index 015d196eaf..160efe90df 100644 --- a/third-party/tbb/test/tbb/test_eh_flow_graph.cpp +++ b/third-party/tbb/test/tbb/test_eh_flow_graph.cpp @@ -482,6 +482,7 @@ void run_one_functype_node_test(bool throwException, bool flog, const char * /*name*/) { std::stringstream ss; + std::string ss_str; char *saved_msg = const_cast(g_Wakeup_Msg); tbb::flow::graph g; @@ -511,7 +512,8 @@ run_one_functype_node_test(bool throwException, bool flog, const char * /*name*/ for(int iter = 0; iter < 2; ++iter) { // run, reset, run again ss.clear(); ss << saved_msg << " iter=" << iter << ", threads=" << g_NumThreads << ", throw=" << (throwException ? "T" : "F") << ", flow=" << (flog ? "T" : "F"); - g_Wakeup_Msg = ss.str().c_str(); + ss_str = ss.str(); + g_Wakeup_Msg = ss_str.c_str(); ResetGlobals(throwException,flog); if(throwException) { TRY(); diff --git a/third-party/tbb/test/tbb/test_environment_whitebox.cpp b/third-party/tbb/test/tbb/test_environment_whitebox.cpp index ecc46e3ac5..9092135da9 100644 --- a/third-party/tbb/test/tbb/test_environment_whitebox.cpp +++ b/third-party/tbb/test/tbb/test_environment_whitebox.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -130,7 +130,7 @@ std::vector> initialize_cases( bool wrong_result ) cases.push_back(std::make_pair("1 ", true)); cases.push_back(std::make_pair(" 1 ", true)); cases.push_back(std::make_pair(" 1", true)); - cases.push_back(std::make_pair((std::string(large_length, ' ') + '1').c_str(), true)); + cases.push_back(std::make_pair((std::string(large_length, ' ') + '1'), true)); // Invalid cases cases.push_back(std::make_pair("", wrong_result)); @@ -150,7 +150,7 @@ std::vector> initialize_cases( bool wrong_result ) cases.push_back(std::make_pair("2018", wrong_result)); cases.push_back(std::make_pair("ABC_123", wrong_result)); cases.push_back(std::make_pair("true", wrong_result)); - cases.push_back(std::make_pair(std::string(large_length, 'A').c_str(), wrong_result)); + cases.push_back(std::make_pair(std::string(large_length, 'A'), wrong_result)); prepare_random_cases(cases); return cases; @@ -162,27 +162,27 @@ std::vector> initialize_cases( long wrong_result ) // Valid cases for (long i = 0; i < 100; ++i) { ss << i; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); ss << " " << i << " "; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); ss << i << " "; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); ss << " " << i; - cases.push_back(std::make_pair(ss.str().c_str(), i)); + cases.push_back(std::make_pair(ss.str(), i)); ss.str(""); } ss << LONG_MAX; - cases.push_back(std::make_pair(ss.str().c_str(), LONG_MAX)); + cases.push_back(std::make_pair(ss.str(), LONG_MAX)); ss.str(""); - cases.push_back(std::make_pair((std::string(large_length, ' ') + '1').c_str(), 1L)); + cases.push_back(std::make_pair((std::string(large_length, ' ') + '1'), 1L)); // Invalid cases cases.push_back(std::make_pair("", wrong_result)); @@ -202,11 +202,11 @@ std::vector> initialize_cases( long wrong_result 
) cases.push_back(std::make_pair("false", wrong_result)); cases.push_back(std::make_pair("1A", wrong_result)); cases.push_back(std::make_pair("_123", wrong_result)); - cases.push_back(std::make_pair(std::string(large_length, 'A').c_str(), wrong_result)); + cases.push_back(std::make_pair(std::string(large_length, 'A'), wrong_result)); // Prepare string with LONG_MAX + 1 value ss << LONG_MAX / 10 << (LONG_MAX % 10 + 1); - cases.push_back(std::make_pair(ss.str().c_str(), -1)); + cases.push_back(std::make_pair(ss.str(), -1)); ss.str(""); prepare_random_cases(cases); diff --git a/third-party/tbb/test/tbb/test_flow_graph_whitebox.cpp b/third-party/tbb/test/tbb/test_flow_graph_whitebox.cpp index a3ed03b252..88365d892d 100644 --- a/third-party/tbb/test/tbb/test_flow_graph_whitebox.cpp +++ b/third-party/tbb/test/tbb/test_flow_graph_whitebox.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -459,7 +459,7 @@ template <> struct DecrementerHelper { template static void check(Decrementer& decrementer) { - auto& d = static_cast(decrementer); + auto& d = static_cast(decrementer); CHECK_MESSAGE(d.my_predecessor_count == 0, "error in pred count"); CHECK_MESSAGE(d.my_initial_predecessor_count == 0, "error in initial pred count"); CHECK_MESSAGE(d.my_current_count == 0, "error in current count"); diff --git a/third-party/tbb/test/tbb/test_function_node.cpp b/third-party/tbb/test/tbb/test_function_node.cpp index aa7e41ca59..999adac189 100644 --- a/third-party/tbb/test/tbb/test_function_node.cpp +++ b/third-party/tbb/test/tbb/test_function_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -469,6 +469,261 @@ void test_follows_and_precedes_api() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +// Basic idea of the following tests is to check that try_put_and_wait(message) call for function_node +// with one of the policies (lightweight, queueing and rejecting) with different concurrency limits +// processes all of the previous jobs required to process message, the message itself, but does +// not process the elements submitted later or not required to process the message +// These tests submit start_work_items using the regular try_put and then submit wait_message +// with try_put_and_wait. During the completion of the graph, new_work_items would be submitted +// once the wait_message arrives. +void test_try_put_and_wait_lightweight(std::size_t concurrency_limit) { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + for (int i = 0; i < wait_message; ++i) { + function.try_put(i); + } + + function.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + + // For lightweight function_node, start_work_items are expected to be processed first + // while putting items into the first node. 
+ for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + + if (concurrency_limit == tbb::flow::serial) { + // If the lightweight function_node is serial, it should process the wait_message but add items from new_work_items + // into the queue since the concurrency limit is occupied. + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + 1, "Unexpected number of elements processed"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + } else { + // If the node is unlimited, it should process new_work_items immediately while processing the wait_message + // Hence they should be processed before exiting the try_put_and_wait + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + new_work_items.size() + 1, + "Unexpected number of elements processed"); + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + // wait_message would be processed only after new_work_items + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + } + + g.wait_for_all(); + + if (concurrency_limit == tbb::flow::serial) { + // For the serial node, processing of new_work_items would be postponed to wait_for_all since they + // would be queued and spawned after working with wait_message + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } + CHECK(check_index == processed_items.size()); + }); +} + +void test_try_put_and_wait_queueing(std::size_t concurrency_limit) { + tbb::task_arena arena(1); + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + 
new_work_items.emplace_back(i + 1 + wait_message); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + for (int i = 0; i < wait_message; ++i) { + function.try_put(i); + } + + function.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + + if (concurrency_limit == tbb::flow::serial) { + // Serial queueing function_node should add all start_work_items except the first one into the queue + // and then process them in FIFO order. + // wait_message would also be added to the queue, but would be processed later + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + 1, "Unexpected number of elements processed"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } else { + CHECK_MESSAGE(processed_items.size() == 1, "Unexpected number of elements processed"); + } + + // For the unlimited function_node, all of the tasks for start_work_items and wait_message would be spawned + // and hence processed by the thread in LIFO order. 
+ // The first processed item is expected to be wait_message since it was spawned last + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + if (concurrency_limit == tbb::flow::serial) { + // For serial queueing function_node, the new_work_items are expected to be processed while calling to wait_for_all + // They would be queued and processed later in FIFO order + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } else { + // Unlimited function_node would always spawn tasks immediately without adding them into the queue + // They would be processed in LIFO order. Hence it is expected that new_work_items would be processed first in reverse order + // After them, start_work_items would be processed also in reverse order + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], "Unexpected items processing"); + } + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], "Unexpected items processing"); + } + } + CHECK(check_index == processed_items.size()); + }); +} + +void test_try_put_and_wait_rejecting(size_t concurrency_limit) { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 0; + + for (int i = 1; i < wait_message; ++i) { + new_work_items.emplace_back(i); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) 
noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + // If the first action is try_put_and_wait, it will occupy concurrency of the function_node + // All submits of new_work_items inside of the body should be rejected + bool result = function.try_put_and_wait(wait_message); + CHECK_MESSAGE(result, "task should not rejected since the node concurrency is not saturated"); + + CHECK_MESSAGE(processed_items.size() == 1, nullptr); + CHECK_MESSAGE(processed_items[0] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items.size() == 1, nullptr); + + processed_items.clear(); + + // If the first action is try_put, try_put_and_wait is expected to return false since the concurrency of the + // node would be saturated + function.try_put(0); + result = function.try_put_and_wait(wait_message); + CHECK_MESSAGE(!result, "task should be rejected since the node concurrency is saturated"); + CHECK(processed_items.empty()); + + g.wait_for_all(); + + CHECK(processed_items.size() == 1); + CHECK_MESSAGE(processed_items[0] == 0, "Unexpected items processing"); + }); +} + +void test_try_put_and_wait() { + test_try_put_and_wait_lightweight(tbb::flow::serial); + test_try_put_and_wait_lightweight(tbb::flow::unlimited); + + test_try_put_and_wait_queueing(tbb::flow::serial); + test_try_put_and_wait_queueing(tbb::flow::unlimited); + + test_try_put_and_wait_rejecting(tbb::flow::serial); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Test various node bodies with concurrency //! \brief \ref error_guessing @@ -544,3 +799,10 @@ TEST_CASE("constraints for function_node body") { static_assert(!can_call_function_node_ctor>); } #endif // __TBB_CPP20_CONCEPTS_PRESENT + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test function_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_indexer_node.cpp b/third-party/tbb/test/tbb/test_indexer_node.cpp index 4ce87e195a..c47a8cad01 100644 --- a/third-party/tbb/test/tbb/test_indexer_node.cpp +++ b/third-party/tbb/test/tbb/test_indexer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -661,6 +661,81 @@ void test_deduction_guides() { #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items1; + std::vector processed_items2; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::indexer_node indexer(g); + using output_type = decltype(indexer)::output_type; + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](output_type tag_msg) noexcept { + if (tag_msg.tag() == 0) { + int input = tag_msg.cast_to(); + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(indexer).try_put(item); + tbb::flow::input_port<1>(indexer).try_put(float(item)); + } + } + processed_items1.emplace_back(input); + } else { + processed_items2.emplace_back(tag_msg.cast_to()); + } + return 0; + }); + + tbb::flow::make_edge(indexer, function); + + for (auto item : start_work_items) { + tbb::flow::input_port<0>(indexer).try_put(item); + tbb::flow::input_port<1>(indexer).try_put(float(item)); + } + + tbb::flow::input_port<0>(indexer).try_put_and_wait(wait_message); + + // Since function is a serial queueing function node, all 
start_work_items would be stored in a queue + // wait_message would be stored at the end of the queue + // During the try_put_and_wait call, start_work_items would be processed from the queue in FIFO order + // wait_message would be processed last and adds new_work_items into the same queue + // It is expected then new_work_items would be processed during wait_for_all() call + + std::size_t check_index1 = 0; + std::size_t check_index2 = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items1[check_index1++] == item, "Unexpected items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == float(item), "Unexpected items processing"); + } + + // wait_message was submitted only to the first port of indexer_node + CHECK_MESSAGE(processed_items1[check_index1++] == wait_message, "Unexpected wait_message processing"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items1[check_index1++] == item, "Unexpected new_work_items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == float(item), "Unexpected new_work_items processing"); + } + CHECK((check_index1 == processed_items1.size() && check_index2 == processed_items2.size())); + }); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Serial and parallel test on various tuple sizes //! \brief \ref error_guessing TEST_CASE("Serial and parallel test") { @@ -712,3 +787,9 @@ TEST_CASE("Deduction guides") { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test indexer_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_input_node.cpp b/third-party/tbb/test/tbb/test_input_node.cpp index f27bf71482..9442693980 100644 --- a/third-party/tbb/test/tbb/test_input_node.cpp +++ b/third-party/tbb/test/tbb/test_input_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,8 +30,8 @@ //! \brief Test for [flow_graph.input_node] specification -using tbb::detail::d1::graph_task; -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::graph_task; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; const int N = 1000; @@ -61,6 +61,12 @@ class test_push_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T& v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } diff --git a/third-party/tbb/test/tbb/test_join_node.cpp b/third-party/tbb/test/tbb/test_join_node.cpp index 2e3af3c547..7f1721e0ee 100644 --- a/third-party/tbb/test/tbb/test_join_node.cpp +++ b/third-party/tbb/test/tbb/test_join_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -154,3 +154,40 @@ TEST_CASE("Test removal of the predecessor while having none") { test(connect_join_via_make_edge); } + +//! 
\brief \ref error_guessing +TEST_CASE("Test reservation on the port") { + tbb::flow::graph g; + + tbb::flow::buffer_node buffer1(g), buffer2(g); + tbb::flow::join_node, tbb::flow::reserving> join(g); + tbb::flow::buffer_node> buffer3(g); + + auto& port0 = tbb::flow::input_port<0>(join); + auto& port1 = tbb::flow::input_port<1>(join); + + tbb::flow::make_edge(buffer1, port0); + tbb::flow::make_edge(buffer2, port1); + tbb::flow::make_edge(join, buffer3); + + int value = -42; + bool result = port0.reserve(value); + CHECK_MESSAGE(!result, "Incorrect reserve return value"); + + result = port1.reserve(value); + CHECK_MESSAGE(!result, "Incorrect reserve return value"); + + buffer1.try_put(1); + g.wait_for_all(); + + result = port0.reserve(value); + CHECK_MESSAGE(result, "Incorrect reserve return value"); + CHECK_MESSAGE(value == 1, "Incorrect reserved value"); + port0.release(); + + buffer2.try_put(2); + g.wait_for_all(); + + result = port1.reserve(value); + CHECK_MESSAGE(result, "incorrect reserve return value"); +} diff --git a/third-party/tbb/test/tbb/test_join_node.h b/third-party/tbb/test/tbb/test_join_node.h index 8969634e8a..2216310c1a 100644 --- a/third-party/tbb/test/tbb/test_join_node.h +++ b/third-party/tbb/test/tbb/test_join_node.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -221,7 +221,7 @@ void print_my_value(MyKeySecond const &i) { template<> void print_my_value(std::string const &i) { - INFO("\"" << i.c_str() << "\"" ); + INFO("\"" << i << "\"" ); } // @@ -245,10 +245,10 @@ struct my_struct_key { } }; -using tbb::detail::d1::type_to_key_function_body; -using tbb::detail::d1::hash_buffer; +using tbb::detail::d2::type_to_key_function_body; +using tbb::detail::d2::type_to_key_function_body_leaf; +using tbb::detail::d2::hash_buffer; using tbb::detail::d1::tbb_hash_compare; -using tbb::detail::d1::type_to_key_function_body_leaf; template struct VtoKFB { typedef type_to_key_function_body type; diff --git a/third-party/tbb/test/tbb/test_join_node_preview.cpp b/third-party/tbb/test/tbb/test_join_node_preview.cpp index 4bcb1900d6..3ee4075794 100644 --- a/third-party/tbb/test/tbb/test_join_node_preview.cpp +++ b/third-party/tbb/test/tbb/test_join_node_preview.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Intel Corporation + Copyright (c) 2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -82,6 +82,249 @@ void test_follows_and_precedes_api() { jn_msg_key_matching_follows_and_precedes(); } +void test_try_put_and_wait_queueing() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::join_node join(g); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(join, function); + + for (auto item : start_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + + tbb::flow::input_port<0>(join).try_put(wait_message); + tbb::flow::input_port<1>(join).try_put(wait_message); + tbb::flow::input_port<2>(join).try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. 
Other tuples would be rejected and taken using push-pull protocol + // in FIFO order + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + }); +} + +void test_try_put_and_wait_reserving() { + tbb::task_arena arena(1); + + arena.execute([]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::queue_node buffer1(g); + tbb::flow::queue_node buffer2(g); + tbb::flow::queue_node buffer3(g); + + tbb::flow::join_node join(g); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + buffer1.try_put(item); + buffer2.try_put(item); + buffer3.try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer1, tbb::flow::input_port<0>(join)); + tbb::flow::make_edge(buffer2, tbb::flow::input_port<1>(join)); + tbb::flow::make_edge(buffer3, tbb::flow::input_port<2>(join)); + tbb::flow::make_edge(join, function); + + for (auto item : start_work_items) { + buffer1.try_put(item); + buffer2.try_put(item); + buffer3.try_put(item); + } + + 
buffer1.try_put(wait_message); + buffer2.try_put(wait_message); + buffer3.try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. Other tuples would be rejected and taken using push-pull protocol + // between function and join_node and between join_node and each buffer in FIFO order because queue_node is used + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + }); +} + +struct int_wrapper { + int i = 0; + int_wrapper() : i(0) {} + int_wrapper(int ii) : i(ii) {} + int_wrapper& operator=(int ii) { + i = ii; + return *this; + } + + int key() const { + return i; + } + + friend bool operator==(const int_wrapper& lhs, const int_wrapper& rhs) { + return lhs.i == rhs.i; + } +}; + +template +void test_try_put_and_wait_key_matching(Body... 
body) { + // Body of one argument for testing standard key_matching + // Body of zero arguments for testing message based key_matching + static_assert(sizeof...(Body) == 0 || sizeof...(Body) == 1, "incorrect test setup"); + tbb::task_arena arena(1); + + arena.execute([=] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int_wrapper wait_message = 10; + + for (int i = 0; i < wait_message.i; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message.i); + } + + using tuple_type = std::tuple; + tbb::flow::join_node> join(g, body..., body..., body...); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(join, function); + + tbb::flow::input_port<0>(join).try_put(wait_message); + tbb::flow::input_port<1>(join).try_put(wait_message); + + // For the first port - submit items in reversed order + for (std::size_t i = start_work_items.size(); i != 0; --i) { + tbb::flow::input_port<0>(join).try_put(start_work_items[i - 1]); + } + + // For first two ports - submit items in direct order + for (auto item : start_work_items) { + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + + tbb::flow::input_port<2>(join).try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. 
Other tuples would be rejected and taken using push-pull protocol + // in order of submission + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + }); +} + //! Test follows and precedes API //! \brief \ref error_guessing TEST_CASE("Test follows and precedes API"){ @@ -101,3 +344,13 @@ TEST_CASE("Test removal of the predecessor while having none") { test(connect_join_via_follows); test(connect_join_via_precedes); } + +//! \brief \ref error_guessing +TEST_CASE("Test join_node try_put_and_wait") { + test_try_put_and_wait_queueing(); + test_try_put_and_wait_reserving(); + // Test standard key_matching policy + test_try_put_and_wait_key_matching([](int_wrapper w) { return w.i; }); + // Test msg based key_matching policy + test_try_put_and_wait_key_matching(); +} diff --git a/third-party/tbb/test/tbb/test_limiter_node.cpp b/third-party/tbb/test/tbb/test_limiter_node.cpp index 897f840d36..0bf4912f8a 100644 --- a/third-party/tbb/test/tbb/test_limiter_node.cpp +++ b/third-party/tbb/test/tbb/test_limiter_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -38,8 +38,8 @@ const int L = 10; const int N = 1000; -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; -using tbb::detail::d1::graph_task; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::graph_task; template< typename T > struct serial_receiver : public tbb::flow::receiver, utils::NoAssign { @@ -53,6 +53,12 @@ struct serial_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -71,6 +77,12 @@ struct parallel_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -534,6 +546,67 @@ void test_decrement_while_try_put_task() { CHECK_MESSAGE(processed.load() == threshold, "decrementer terminate flow graph work"); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + std::size_t threshold = start_work_items.size() + 1; + CHECK_MESSAGE(new_work_items.size() < threshold, "Incorrect test setup"); + + tbb::flow::limiter_node limiter(g, threshold); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + for (auto item : new_work_items) { + 
limiter.try_put(item); + } + } + processed_items.emplace_back(input); + }); + + tbb::flow::make_edge(limiter, function); + tbb::flow::make_edge(function, limiter.decrementer()); + + for (auto item : start_work_items) { + limiter.try_put(item); + } + + limiter.try_put_and_wait(wait_message); + + // Since function is a serial queueing function_node, all start_work_items would be added to the queue + // and processed in FIFO order. wait_message would be added and processed last. Each item in start_work_items + // should put an item to a decrementer edge and hence new_work_items should not be missed as well + + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected new_work_items processing"); + } + CHECK(check_index == processed_items.size()); + }); +} +#endif //! Test puts on limiter_node with decrements and varying parallelism levels //! \brief \ref error_guessing @@ -623,3 +696,10 @@ TEST_CASE("Test correct node deallocation while using small_object_pool") { tbb::task_scheduler_handle handle{ tbb::attach{} }; tbb::finalize( handle, std::nothrow ); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test limiter_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_overwrite_node.cpp b/third-party/tbb/test/tbb/test_overwrite_node.cpp index 127cca2d15..3f5ed8fec0 100644 --- a/third-party/tbb/test/tbb/test_overwrite_node.cpp +++ b/third-party/tbb/test/tbb/test_overwrite_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include "common/graph_utils.h" #include "common/test_follows_and_precedes_api.h" +#include "test_buffering_try_put_and_wait.h" //! \file test_overwrite_node.cpp //! \brief Test for [flow_graph.overwrite_node] specification @@ -183,6 +184,165 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_overwrite_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + { + std::vector processed_items; + + // Returns the index from which wait_for_all processing started + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // It is expected that try_put_and_wait would process start_work_items (FIFO) and the wait_message + // and new_work_items (FIFO) would be processed in wait_for_all + + CHECK_MESSAGE(after_start - 1 == start_work_items.size() + 1, + "incorrect number of items processed by try_put_and_wait"); + std::size_t check_index = 0; + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "unexpected start_work_items processing"); + } + 
CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "unexpected new_work_items processing"); + } + } + // Test pull + { + tbb::task_arena arena(1); + + arena.execute([&] { + std::vector processed_items; + + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + int start_message = 0; + int new_message = 1; + + using function_node_type = tbb::flow::function_node; + + function_node_type function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + buffer.try_put(new_message); + } + + // Explicitly clean the buffer to prevent infinite try_get by the function_node + if (input == new_message) { + buffer.clear(); + } + + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer, function); + + buffer.try_put(start_message); // Occupies concurrency of function + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 2, "only the start_message and wait_message should be processed"); + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items[check_index++] == start_message, "unexpected start_message processing"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, "unexpected new_message processing"); + CHECK(check_index == processed_items.size()); + }); + } + // Test reserve + { + tbb::task_arena arena(1); + + arena.execute([&] { + std::vector processed_items; + + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + tbb::flow::limiter_node limiter(g, 1); + int start_message = 0; + int new_message = 1; + + using function_node_type = tbb::flow::function_node; + + function_node_type function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + buffer.try_put(new_message); + } + + // 
Explicitly clean the buffer to prevent infinite try_get by the function_node + if (input == new_message) { + buffer.clear(); + } + + processed_items.emplace_back(input); + limiter.decrementer().try_put(1); + return 0; + }); + + tbb::flow::make_edge(buffer, limiter); + tbb::flow::make_edge(limiter, function); + + buffer.try_put(start_message); // Occupies concurrency of function + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 2, "only the start_message and wait_message should be processed"); + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items[check_index++] == start_message, "unexpected start_message processing"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, "unexpected new_message processing"); + CHECK(check_index == processed_items.size()); + }); + } + // Test explicit clear + { + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + + std::vector processed_items; + + tbb::flow::function_node f(g, tbb::flow::serial, + [&](int input) { + processed_items.emplace_back(input); + buffer.clear(); + return 0; + }); + + tbb::flow::make_edge(buffer, f); + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 1, "Incorrect number of processed items"); + CHECK_MESSAGE(processed_items.back() == wait_message, "unexpected processing"); + + g.wait_for_all(); + + CHECK(processed_items.size() == 1); + CHECK(processed_items.back() == wait_message); + } +} +#endif + //! Test read-write properties //! \brief \ref requirement \ref error_guessing TEST_CASE("Read-write"){ @@ -256,3 +416,10 @@ TEST_CASE("Cancel register_predecessor_task") { // Wait for cancellation of spawned tasks g.wait_for_all(); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test overwrite_node try_put_and_wait") { + test_overwrite_node_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_partitioner.cpp b/third-party/tbb/test/tbb/test_partitioner.cpp index 9af5009dad..e0fb98fc28 100644 --- a/third-party/tbb/test/tbb/test_partitioner.cpp +++ b/third-party/tbb/test/tbb/test_partitioner.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2021-2023 Intel Corporation + Copyright (c) 2021-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include "tbb/parallel_for.h" #include "tbb/task_arena.h" +#include "tbb/task_scheduler_observer.h" #include "tbb/global_control.h" #include "oneapi/tbb/mutex.h" @@ -36,10 +37,33 @@ namespace task_affinity_retention { +class leaving_observer : public tbb::task_scheduler_observer { + std::atomic my_thread_count{}; +public: + leaving_observer(tbb::task_arena& a) : tbb::task_scheduler_observer(a) { + observe(true); + } + + void on_scheduler_entry(bool) override { + ++my_thread_count; + } + + void on_scheduler_exit(bool) override { + --my_thread_count; + } + + void wait_leave() { + while (my_thread_count.load() != 0) { + std::this_thread::yield(); + } + } +}; + template float test(PerBodyFunc&& body) { const std::size_t num_threads = 2 * utils::get_platform_max_threads(); tbb::global_control concurrency(tbb::global_control::max_allowed_parallelism, num_threads); tbb::task_arena big_arena(static_cast(num_threads)); + leaving_observer observer(big_arena); #if __TBB_USE_THREAD_SANITIZER // Reduce execution time under Thread Sanitizer @@ -77,8 +101,10 @@ template float test(PerBodyFunc&& body) { tbb::static_partitioner() ); }); - // TODO: - // - Consider introducing an observer to guarantee the threads left the arena. 
+ // To avoid tasks stealing in the beginning of the parallel algorithm, the test waits for + // the threads to leave the arena, so that on the next iteration they have tasks assigned + // in their mailboxes and, thus, don't need to search for work to do in other task pools. + observer.wait_leave(); } std::size_t range_shifts = 0; @@ -142,12 +168,15 @@ void strict_test() { } // namespace task_affinity_retention +// global_control::max_allowed_parallelism functionality is not covered by TCM +#if !__TBB_TCM_TESTING_ENABLED //! Testing affinitized tasks are not stolen //! \brief \ref error_guessing TEST_CASE("Threads respect task affinity") { task_affinity_retention::relaxed_test(); task_affinity_retention::strict_test(); } +#endif template void test_custom_range(int diff_mult) { diff --git a/third-party/tbb/test/tbb/test_priority_queue_node.cpp b/third-party/tbb/test/tbb/test_priority_queue_node.cpp index d14aa4bbb3..18a60eb935 100644 --- a/third-party/tbb/test/tbb/test_priority_queue_node.cpp +++ b/third-party/tbb/test/tbb/test_priority_queue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include +#include "test_buffering_try_put_and_wait.h" //! \file test_priority_queue_node.cpp //! 
\brief Test for [flow_graph.priority_queue_node] specification @@ -378,6 +379,166 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_pqueue_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = -10; + + for (int i = 0; i < 10; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer start_work_items into the function_node in LIFO order + // 4. wait_message would be transferred last because of lowest priority + // 5. the first item would occupy concurrency of function, other items would be pushed to the queue + // 6. function would process start_work_items and push them to the buffer2 + // 7. wait_message would be processed last and add new_work_items to buffer1 + // 8. forward_task on buffer2 would transfer start_work_items in FIFO order and the wait_message to the writer + // 9. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. 
forward_task for new_work_items in buffer1 would be spawned and put items in function in LIFO order + // Expected items processing - { start_work_items LIFO, wait_message, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], + "try_put_and_wait should process start_work_items in LIFO order"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], + "wait_for_all should process new_work_items in LIFO order"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. 
forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in the priority (LIFO) order + // 6. wait_message would be taken last due to lowest priority + // 7. new_work_items would be pushed to the buffer while processing wait_message + // During wait_for_all() + // 8. new_work_items would be taken from the buffer in the priority (LIFO) order + // Expected items processing { occupier, start_work_items LIFO, wait_message, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 2, + "try_put_and_wait should process start_work_items, occupier and the wait_message"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "try_put_and_wait should process the occupier"); + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], + "try_put_and_wait should process start_work_items in LIFO order"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], + "wait_for_all should process new_work_items in LIFO order"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int threshold : thresholds) { + std::vector processed_items; + + // test_buffer_reserve tests the following graph + // buffer -> limiter -> function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items); + + // Expected effect: + // 1. 
start_work_items would be pushed to the buffer + // 2. wait_message_would be pushed to the buffer + // 3. forward task of the buffer would push the first message to the limiter node. + // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for the first message processing + // 5. the first would be processed + // 6. decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // in the priority (greatest first) order + // 7. When the wait_message would be taken from the queue, the try_put_and_wait would exit + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should start_work_items and wait_message"); + for (std::size_t index = start_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1], + "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + + for (std::size_t index = new_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1], + "Unexpected new_work_items processing"); + } + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test serial, parallel behavior and reservation under parallelism //! \brief \ref requirement \ref error_guessing TEST_CASE("Serial, parallel and reservation tests"){ @@ -419,3 +580,9 @@ TEST_CASE("Test deduction guides"){ } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test priority_queue_node try_put_and_wait") { + test_pqueue_node_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_queue_node.cpp b/third-party/tbb/test/tbb/test_queue_node.cpp index e034ef6645..546b47edae 100644 --- a/third-party/tbb/test/tbb/test_queue_node.cpp +++ b/third-party/tbb/test/tbb/test_queue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include +#include "test_buffering_try_put_and_wait.h" //! \file test_queue_node.cpp //! \brief Test for [flow_graph.queue_node] specification @@ -494,6 +495,162 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_queue_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. 
start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in FIFO order + // 4. the first item would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process start_work_items and push them to the buffer2 + // 6. wait_message would be processed last and add new_work_items to buffer1 + // 7. forward_task on buffer2 would transfer start_work_items in FIFO order and the wait_message to the writer + // 8. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in FIFO order + // 11. function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "wait_for_all should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. 
buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in FIFO order + // 6. wait_message would be taken last and push new_work_items to the buffer + // During wait_for_all() + // 7. new_work_items would be taken from the buffer in FIFO order + // Expected items processing { occupier, start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 2, + "start_work_items, occupier and wait_message should be processed by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int threshold : thresholds) { + std::vector processed_items; + + // 
test_buffer_reserve tests the following graph + // buffer -> limiter -> function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items); + + // Expected effect: + // 1. start_work_items would be pushed to the buffer + // 2. wait_message_would be pushed to the buffer + // 3. forward task of the buffer would push the first message to the limiter node. + // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for the first message processing + // 5. the first would be processed + // 6. decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // 7. When the wait_message would be taken from the queue, the try_put_and_wait would exit + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should start_work_items and wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test serial, parallel behavior and reservation under parallelism //! 
\brief \ref requirement \ref error_guessing TEST_CASE("Parallel, serial test"){ @@ -559,3 +716,10 @@ TEST_CASE("queue_node with reservation"){ CHECK_MESSAGE((out_arg == -1), "Getting from reserved node should not update its argument."); g.wait_for_all(); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test queue_node try_put_and_wait") { + test_queue_node_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_sequencer_node.cpp b/third-party/tbb/test/tbb/test_sequencer_node.cpp index 564721f682..1e6494d69b 100644 --- a/third-party/tbb/test/tbb/test_sequencer_node.cpp +++ b/third-party/tbb/test/tbb/test_sequencer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include +#include "test_buffering_try_put_and_wait.h" //! \file test_sequencer_node.cpp //! \brief Test for [flow_graph.sequencer_node] specification @@ -437,6 +438,169 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_seq_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + auto simple_sequencer = [](int item) { return item; }; + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. 
buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items, + simple_sequencer); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in sequencer order (FIFO) + // 4. the first item would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process start_work_items and push them to the buffer2 + // 6. wait_message would be processed last and add new_work_items to buffer1 + // 7. forward_task on buffer2 would transfer start_work_items in sequencer (FIFO) order and the wait_message to the writer + // 8. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in FIFO order + // 11. 
function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "wait_for_all should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items, + simple_sequencer); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. 
items would be taken from the buffer by function in FIFO order + // 6. wait_message would be taken last and push new_work_items to the buffer + // During wait_for_all() + // 7. new_work_items would be taken from the buffer in FIFO order + // Expected items processing { occupier, start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 2, + "start_work_items, occupier and wait_message should be processed by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int threshold : thresholds) { + std::vector processed_items; + + // test_buffer_reserve tests the following graph + // buffer -> limiter -> function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items, simple_sequencer); + + // Expected effect: + // 1. start_work_items would be pushed to the buffer + // 2. wait_message_would be pushed to the buffer + // 3. forward task of the buffer would push the first message to the limiter node. + // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for the first message processing + // 5. 
the first would be processed + // 6. decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // 7. When the wait_message would be taken from the buffer, the try_put_and_wait would exit + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "start_work_items, occupier and wait_message should be processed by try_put_and_wait"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test sequencer with various request orders and parallelism levels //! \brief \ref requirement \ref error_guessing TEST_CASE("Serial and parallel test"){ @@ -501,3 +665,10 @@ TEST_CASE("constraints for sequencer_node sequencer") { static_assert(!can_call_sequencer_node_ctor>); } #endif // __TBB_CPP20_CONCEPTS_PRESENT + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test sequencer_node try_put_and_wait") { + test_seq_node_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_split_node.cpp b/third-party/tbb/test/tbb/test_split_node.cpp index e791b546b5..1e03be0dab 100644 --- a/third-party/tbb/test/tbb/test_split_node.cpp +++ b/third-party/tbb/test/tbb/test_split_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -397,6 +397,83 @@ void test_deduction_guides() { #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items1; + std::vector processed_items2; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::split_node split(g); + + tbb::flow::function_node function1(g, tbb::flow::unlimited, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + split.try_put(tuple_type{item, item}); + } + } + processed_items1.emplace_back(input); + return 0; + }); + + tbb::flow::function_node function2(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items2.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(tbb::flow::output_port<0>(split), function1); + tbb::flow::make_edge(tbb::flow::output_port<1>(split), function2); + + for (int i = 0; i < wait_message; ++i) { + split.try_put(tuple_type{i, i}); + } + + split.try_put_and_wait(tuple_type{wait_message, wait_message}); + + std::size_t check_index1 = 0; + std::size_t check_index2 = 0; + + // Since split node broadcasts items to 
successors from last to first, start_work_items tasks and wait_message would be spawned + // in the following order {f2 - 1} - {f1 - 1} {f2 - 2} {f1 - 2} ... {f2 - 10}{f1 - 10} + // and processed in reversed order + // Hence {f1 - wait_message} task would be processed first and it would spawn tasks for new_work_items in the same order + // Since new_work_items tasks would processed first and {f2 - 10} would be still in queue + // it is expected that during the try_put_and_wait {f1 - 10} would be processed first, then new_work_items would be processed + // and only when {f2 - 10} would be taken and executed, try_put_and_wait would be exitted + // All of the other tasks for start_work_items would be processed during wait_for_all() + CHECK_MESSAGE(processed_items1[check_index1++] == wait_message, "Unexpected items processing"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items1[check_index1++] == new_work_items[i - 1], "Unexpected items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == new_work_items[i - 1], "Unexpected items processing"); + } + + CHECK_MESSAGE(processed_items2[check_index2++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items1[check_index1++] == start_work_items[i - 1], "Unexpected items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == start_work_items[i - 1], "Unexpected items processing"); + } + }); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test output ports and message passing with different input tuples //! \brief \ref requirement \ref error_guessing TEST_CASE("Tuple tests"){ @@ -446,3 +523,9 @@ TEST_CASE("Deduction guides"){ } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test split_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/third-party/tbb/test/tbb/test_tagged_msg.cpp b/third-party/tbb/test/tbb/test_tagged_msg.cpp index 656f0d3e89..520ecda9c2 100644 --- a/third-party/tbb/test/tbb/test_tagged_msg.cpp +++ b/third-party/tbb/test/tbb/test_tagged_msg.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ typedef tbb::flow::tagged_msg wi(42); Wrapper wic(23); diff --git a/third-party/tbb/test/tbb/test_task.cpp b/third-party/tbb/test/tbb/test_task.cpp index 876e351006..6c2060a69a 100644 --- a/third-party/tbb/test/tbb/test_task.cpp +++ b/third-party/tbb/test/tbb/test_task.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ #include #include -#include +#include //! \file test_task.cpp //! \brief Test for [internal] functionality @@ -840,3 +840,65 @@ TEST_CASE("Check correct arena destruction with enqueue") { tbb::finalize(handle, std::nothrow_t{}); } } + +//! \brief \ref regression +TEST_CASE("Try to force Leaked proxy observers warning") { + int num_threads = std::thread::hardware_concurrency() * 2; + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); + tbb::task_arena arena(num_threads, 0); + std::deque observers; + for (int i = 0; i < 1000; ++i) { + observers.emplace_back(arena); + } + + for (auto& observer : observers) { + observer.observe(true); + } + + arena.enqueue([] { + tbb::parallel_for(0, 100000, [] (int) { + utils::doDummyWork(1000); + }); + }); +} + +//! 
\brief \ref error_guessing +TEST_CASE("Force thread limit on per-thread reference_vertex") { + int num_threads = std::thread::hardware_concurrency(); + int num_groups = 1000; + + // Force thread limit on per-thread reference_vertex + std::vector groups(num_groups); + tbb::parallel_for(0, num_threads, [&] (int) { + std::vector local_groups(num_groups); + for (int i = 0; i < num_groups; ++i) { + groups[i].run([] {}); + local_groups[i].run([] {}); + local_groups[i].wait(); + } + }, tbb::static_partitioner{}); + + // Enforce extra reference on each task_group + std::deque handles{}; + for (int i = 0; i < num_groups; ++i) { + handles.emplace_back(groups[i].defer([] {})); + } + + // Check correctness of the execution + tbb::task_group group; + + std::atomic final_sum{}; + for (int i = 0; i < num_groups; ++i) { + group.run([&] { ++final_sum; }); + } + group.wait(); + REQUIRE_MESSAGE(final_sum == num_groups, "Some tasks were not executed"); + + for (int i = 0; i < num_groups; ++i) { + groups[i].run(std::move(handles[i])); + } + + for (int i = 0; i < num_groups; ++i) { + groups[i].wait(); + } +} diff --git a/third-party/tbb/test/tbb/test_task_arena.cpp b/third-party/tbb/test/tbb/test_task_arena.cpp index fd930f1995..6bd93d4c0e 100644 --- a/third-party/tbb/test/tbb/test_task_arena.cpp +++ b/third-party/tbb/test/tbb/test_task_arena.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -1941,6 +1941,8 @@ TEST_CASE("Stress test with mixing functionality") { StressTestMixFunctionality(); } +// global_control::max_allowed_parallelism functionality is not covered by TCM +#if !__TBB_TCM_TESTING_ENABLED //! 
\brief \ref stress TEST_CASE("Workers oversubscription") { std::size_t num_threads = utils::get_platform_max_threads(); @@ -1977,6 +1979,7 @@ TEST_CASE("Workers oversubscription") { ); }); } +#endif #if TBB_USE_EXCEPTIONS //! The test for error in scheduling empty task_handle diff --git a/third-party/tbb/test/tbb/test_task_group.cpp b/third-party/tbb/test/tbb/test_task_group.cpp index d39b4fc703..5ad8355a15 100644 --- a/third-party/tbb/test/tbb/test_task_group.cpp +++ b/third-party/tbb/test/tbb/test_task_group.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -397,7 +397,7 @@ class test_exception : public std::exception public: test_exception ( const char* descr ) : m_strDescription(descr) {} - const char* what() const throw() override { return m_strDescription; } + const char* what() const noexcept override { return m_strDescription; } }; using TestException = test_exception; @@ -780,8 +780,11 @@ TEST_CASE("Thread safety test for the task group") { TEST_CASE("Fibonacci test for the task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); + tbb::task_arena a(p); g_MaxConcurrency = p; - RunFibonacciTests(); + a.execute([] { + RunFibonacciTests(); + }); } } @@ -838,7 +841,10 @@ TEST_CASE("Thread safety test for the isolated task group") { } tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - TestThreadSafety(); + tbb::task_arena a(p); + a.execute([] { + TestThreadSafety(); + }); } } #endif @@ -849,7 +855,10 @@ TEST_CASE("Fibonacci test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - RunFibonacciTests(); + 
tbb::task_arena a(p); + a.execute([] { + RunFibonacciTests(); + }); } } @@ -859,7 +868,10 @@ TEST_CASE("Cancellation and exception test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - RunCancellationAndExceptionHandlingTests(); + tbb::task_arena a(p); + a.execute([] { + RunCancellationAndExceptionHandlingTests(); + }); } } @@ -869,7 +881,10 @@ TEST_CASE("Constant functor test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - TestConstantFunctorRequirement(); + tbb::task_arena a(p); + a.execute([] { + TestConstantFunctorRequirement(); + }); } } @@ -879,7 +894,10 @@ TEST_CASE("Move semantics test for the isolated task group") { for (unsigned p=MinThread; p <= MaxThread; ++p) { tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p); g_MaxConcurrency = p; - TestMoveSemantics(); + tbb::task_arena a(p); + a.execute([] { + TestMoveSemantics(); + }); } } @@ -1204,4 +1222,3 @@ TEST_CASE("task_handle cannot be scheduled into other task_group of the same con } #endif // TBB_USE_EXCEPTIONS - diff --git a/third-party/tbb/test/tbb/test_write_once_node.cpp b/third-party/tbb/test/tbb/test_write_once_node.cpp index 2bb16383f8..6fb716bab0 100644 --- a/third-party/tbb/test/tbb/test_write_once_node.cpp +++ b/third-party/tbb/test/tbb/test_write_once_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -207,6 +207,135 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + int wait_message = 0; + int occupy_concurrency_message = 1; + int new_message = 2; + + // Test push + { + tbb::task_arena arena(1); + + std::vector processed_items; + + arena.execute([&] { + tbb::flow::graph g; + + tbb::flow::write_once_node wo_buffer(g); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + wo_buffer.clear(); + wo_buffer.try_put(new_message); + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(wo_buffer, function); + + wo_buffer.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items.size() == 1, "Only the wait_message should be processed"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Only the wait_message should be processed"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, + "only the new_message should be processed in wait_for_all"); + CHECK(check_index == processed_items.size()); + }); + } + // Test pull + { + std::vector processed_items; + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + tbb::flow::write_once_node wo_buffer(g); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == new_message || input == wait_message) { + wo_buffer.clear(); + } + + if (input == wait_message) { + wo_buffer.try_put(new_message); + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(wo_buffer, function); + + function.try_put(occupy_concurrency_message); + wo_buffer.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items.size() == 2, "unexpected message processing for try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupy_concurrency_message, + "occupy_concurrency_message 
should be processed first"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "wait_message was not processed"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, + "only the new_message should be processed in wait_for_all"); + CHECK(check_index == processed_items.size()); + }); + } + // Test reserve + { + std::vector processed_items; + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + tbb::flow::write_once_node wo_buffer(g); + tbb::flow::limiter_node limiter(g, 1); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == new_message || input == wait_message) { + wo_buffer.clear(); + } + + if (input == wait_message) { + wo_buffer.try_put(new_message); + } + processed_items.emplace_back(input); + limiter.decrementer().try_put(1); + return 0; + }); + + tbb::flow::make_edge(wo_buffer, limiter); + tbb::flow::make_edge(limiter, function); + + limiter.try_put(occupy_concurrency_message); + wo_buffer.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items.size() == 2, "unexpected message processing for try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupy_concurrency_message, + "occupy_concurrency_message should be processed first"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "wait_message was not processed"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, + "only the new_message should be processed in wait_for_all"); + CHECK(check_index == processed_items.size()); + }); + } +} +#endif + //! Test read-write properties //! \brief \ref requirement \ref error_guessing TEST_CASE("Read-write tests"){ @@ -244,3 +373,10 @@ TEST_CASE("Deduction guides"){ test_deduction_guides(); } #endif + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test write_once_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif